refactor: iteratively crawl, fetch and build document tree

2024-11-19 00:12:47 -05:00 · 2024-11-19 00:12:47 -05:00 · db7ecf0ba2
commit db7ecf0ba2
parent d0a7adaf5a
2 changed files with 35 additions and 18 deletions
--- a/blogue/models.py
+++ b/blogue/models.py
@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):

        return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")

+    @property
+    def is_collection(self) -> bool:
+        return self.type == "collection"
+
    @property
    @abc.abstractmethod
    def metadata(self) -> dict[str, object]:
--- a/blogue/nextcloud.py
+++ b/blogue/nextcloud.py
@@ -1,6 +1,7 @@
 import httpx
 import frontmatter

+import re
 import pathlib
 import functools
 import xml.etree.ElementTree as ET
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
        )

    def get_document_tree(self, root: str) -> NextCloudDocument:
-        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}"
-        request = httpx.Request("PROPFIND", root_locator)
+        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"

-        with self._client as client:
-            response = client.send(request)
+        fetch_queue = [(root_locator, 0)]
+        doc_stack = []

-        root_properties = ET.fromstring(response.text)
+        while fetch_queue:
+            current, depth = fetch_queue.pop()

-        root_document = NextCloudDocument.collection(locator=root_locator)
+            if len(doc_stack) > depth:
+                doc_stack.pop()

-        for r in root_properties:
-            doc_path = r.find("{DAV:}href")
+            current_document = self.get_document(current)

-            if doc_path is not None:
-                doc_path = doc_path.text
+            if not doc_stack or current_document.is_collection:
+                doc_stack.append(current_document)

-            if doc_path == root_document.source_locator:
-                continue
+            if current_document.is_collection:
+                for child in ET.fromstring(current_document.raw):
+                    path_match = child.find("{DAV:}href")
+                    path = (
+                        f"{self.base_url}{path_match.text}"
+                        if path_match is not None
+                        else ""
+                    )

-            full_document_path = f"{self.base_url}/{doc_path}"
-            root_document.children.append(self.get_document(full_document_path))
+                    if path == current or not path:
+                        continue

-        return root_document
+                    fetch_queue.append((path, depth + 1))
+
+            else:
+                doc_stack[-1].children.append(current_document)
+
+        return doc_stack[0]

    def get_document(self, path: str) -> NextCloudDocument:
        p = pathlib.Path(path)
+        is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix)

        with self._client as client:
-            verb = "PROPFIND" if not p.suffix else "GET"
+            verb = "PROPFIND" if not is_doc else "GET"
            request = httpx.Request(verb, path)
            response = client.send(request)

-        if p.suffix:
+        if is_doc:
            return NextCloudDocument.doc(locator=path, raw=response.text)
        else:
-            return NextCloudDocument.collection(locator=path)
+            return NextCloudDocument.collection(locator=path, raw=response.text)