refactor: iteratively crawl, fetch and build document tree
parent d0a7adaf5a
commit db7ecf0ba2

2 changed files with 35 additions and 18 deletions
@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):
         return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")
 
+    @property
+    def is_collection(self) -> bool:
+        return self.type == "collection"
+
     @property
     @abc.abstractmethod
     def metadata(self) -> dict[str, object]:
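Side note on the new property: is_collection is what the reworked traversal in the second file branches on, so collections get expanded further while plain documents are attached to their parent. A minimal, self-contained sketch of the same discriminator pattern (the Item model and the example values are illustrative, not the repository's Document class):

import pydantic


class Item(pydantic.BaseModel):
    # "document" or "collection"; mirrors the `type` field used by Document.
    type: str

    @property
    def is_collection(self) -> bool:
        return self.type == "collection"


folder = Item(type="collection")
note = Item(type="document")
assert folder.is_collection and not note.is_collection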
@@ -1,6 +1,7 @@
 import httpx
 import frontmatter
 
+import re
 import pathlib
 import functools
 import xml.etree.ElementTree as ET
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
         )
 
     def get_document_tree(self, root: str) -> NextCloudDocument:
-        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}"
-        request = httpx.Request("PROPFIND", root_locator)
-
-        with self._client as client:
-            response = client.send(request)
-
-        root_properties = ET.fromstring(response.text)
-
-        root_document = NextCloudDocument.collection(locator=root_locator)
-
-        for r in root_properties:
-            doc_path = r.find("{DAV:}href")
-
-            if doc_path is not None:
-                doc_path = doc_path.text
-
-                if doc_path == root_document.source_locator:
-                    continue
-
-                full_document_path = f"{self.base_url}/{doc_path}"
-                root_document.children.append(self.get_document(full_document_path))
-
-        return root_document
+        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"
+        fetch_queue = [(root_locator, 0)]
+        doc_stack = []
+
+        while fetch_queue:
+            current, depth = fetch_queue.pop()
+
+            if len(doc_stack) > depth:
+                doc_stack.pop()
+
+            current_document = self.get_document(current)
+
+            if not doc_stack or current_document.is_collection:
+                doc_stack.append(current_document)
+
+            if current_document.is_collection:
+                for child in ET.fromstring(current_document.raw):
+                    path_match = child.find("{DAV:}href")
+                    path = (
+                        f"{self.base_url}{path_match.text}"
+                        if path_match is not None
+                        else ""
+                    )
+
+                    if path == current or not path:
+                        continue
+
+                    fetch_queue.append((path, depth + 1))
+            else:
+                doc_stack[-1].children.append(current_document)
+
+        return doc_stack[0]
 
     def get_document(self, path: str) -> NextCloudDocument:
         p = pathlib.Path(path)
+        is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix)
 
         with self._client as client:
-            verb = "PROPFIND" if not p.suffix else "GET"
+            verb = "PROPFIND" if not is_doc else "GET"
             request = httpx.Request(verb, path)
             response = client.send(request)
 
-        if p.suffix:
+        if is_doc:
             return NextCloudDocument.doc(locator=path, raw=response.text)
         else:
-            return NextCloudDocument.collection(locator=path)
+            return NextCloudDocument.collection(locator=path, raw=response.text)
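For context on the traversal above: instead of issuing one PROPFIND and attaching a single level of children, get_document_tree now drains an explicit queue of (locator, depth) pairs and keeps a stack of ancestor collections so each fetched document can be attached to its parent. The sketch below shows that queue-plus-ancestor-stack shape in isolation; FAKE_LISTING, Node and build_tree are illustrative stand-ins for the WebDAV client and NextCloudDocument, not code from this repository, and it is not a line-for-line copy of the commit.

from dataclasses import dataclass, field

# Illustrative stand-in for the remote server: maps a collection path to the
# paths it contains (a trailing "/" marks a collection).
FAKE_LISTING: dict[str, list[str]] = {
    "/notes/": ["/notes/ideas/", "/notes/todo.md"],
    "/notes/ideas/": ["/notes/ideas/blog.md"],
}


@dataclass
class Node:
    path: str
    is_collection: bool
    children: list["Node"] = field(default_factory=list)


def build_tree(root: str) -> Node:
    """Crawl `root` iteratively; assumes `root` is a collection (ends with "/")."""
    fetch_queue: list[tuple[str, int]] = [(root, 0)]  # (path, depth) still to fetch
    doc_stack: list[Node] = []                        # collections above the current path

    while fetch_queue:
        current, depth = fetch_queue.pop()  # LIFO pop -> depth-first order

        # Unwind ancestors we have finished descending into.
        while len(doc_stack) > depth:
            doc_stack.pop()

        node = Node(path=current, is_collection=current.endswith("/"))
        if doc_stack:
            doc_stack[-1].children.append(node)  # attach to the nearest open collection

        if node.is_collection:
            doc_stack.append(node)  # becomes the parent of deeper entries
            for child_path in FAKE_LISTING.get(current, []):
                fetch_queue.append((child_path, depth + 1))

    return doc_stack[0]  # the root collection, now holding the full tree

With the listing above, build_tree("/notes/") returns a root node whose children are /notes/todo.md and the /notes/ideas/ collection, which in turn contains /notes/ideas/blog.md.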