From db7ecf0ba2d676a22f6ee62eb4e0b6459b072f74 Mon Sep 17 00:00:00 2001 From: Marc Cataford Date: Tue, 19 Nov 2024 00:12:47 -0500 Subject: [PATCH] refactor: iteratively crawl, fetch and build document tree --- blogue/models.py | 4 ++++ blogue/nextcloud.py | 49 ++++++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/blogue/models.py b/blogue/models.py index 02fe891..1c07d1f 100644 --- a/blogue/models.py +++ b/blogue/models.py @@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC): return cls(**kwargs, source_locator=pathlib.Path(locator), type="document") + @property + def is_collection(self) -> bool: + return self.type == "collection" + @property @abc.abstractmethod def metadata(self) -> dict[str, object]: diff --git a/blogue/nextcloud.py b/blogue/nextcloud.py index 4e6f6a0..a6a2caf 100644 --- a/blogue/nextcloud.py +++ b/blogue/nextcloud.py @@ -1,6 +1,7 @@ import httpx import frontmatter +import re import pathlib import functools import xml.etree.ElementTree as ET @@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True): ) def get_document_tree(self, root: str) -> NextCloudDocument: - root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}" - request = httpx.Request("PROPFIND", root_locator) + root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/" - with self._client as client: - response = client.send(request) + fetch_queue = [(root_locator, 0)] + doc_stack = [] - root_properties = ET.fromstring(response.text) + while fetch_queue: + current, depth = fetch_queue.pop() - root_document = NextCloudDocument.collection(locator=root_locator) + if len(doc_stack) > depth: + doc_stack.pop() - for r in root_properties: - doc_path = r.find("{DAV:}href") + current_document = self.get_document(current) - if doc_path is not None: - doc_path = doc_path.text + if not doc_stack or current_document.is_collection: + doc_stack.append(current_document) - if doc_path == root_document.source_locator: - continue + if current_document.is_collection: + for child in ET.fromstring(current_document.raw): + path_match = child.find("{DAV:}href") + path = ( + f"{self.base_url}{path_match.text}" + if path_match is not None + else "" + ) - full_document_path = f"{self.base_url}/{doc_path}" - root_document.children.append(self.get_document(full_document_path)) + if path == current or not path: + continue - return root_document + fetch_queue.append((path, depth + 1)) + + else: + doc_stack[-1].children.append(current_document) + + return doc_stack[0] def get_document(self, path: str) -> NextCloudDocument: p = pathlib.Path(path) + is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix) with self._client as client: - verb = "PROPFIND" if not p.suffix else "GET" + verb = "PROPFIND" if not is_doc else "GET" request = httpx.Request(verb, path) response = client.send(request) - if p.suffix: + if is_doc: return NextCloudDocument.doc(locator=path, raw=response.text) else: - return NextCloudDocument.collection(locator=path) + return NextCloudDocument.collection(locator=path, raw=response.text) -- 2.45.2