WIP: refactor: iteratively crawl, fetch and build document tree #3

Draft
marc wants to merge 1 commit from feat/support-image-attachments into main
2 changed files with 35 additions and 18 deletions

View file

@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):
return cls(**kwargs, source_locator=pathlib.Path(locator), type="document") return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")
@property
def is_collection(self) -> bool:
return self.type == "collection"
@property @property
@abc.abstractmethod @abc.abstractmethod
def metadata(self) -> dict[str, object]: def metadata(self) -> dict[str, object]:

View file

@@ -1,6 +1,7 @@
import httpx import httpx
import frontmatter import frontmatter
import re
import pathlib import pathlib
import functools import functools
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
) )
def get_document_tree(self, root: str) -> NextCloudDocument: def get_document_tree(self, root: str) -> NextCloudDocument:
root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}" root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"
request = httpx.Request("PROPFIND", root_locator)
with self._client as client: fetch_queue = [(root_locator, 0)]
response = client.send(request) doc_stack = []
root_properties = ET.fromstring(response.text) while fetch_queue:
current, depth = fetch_queue.pop()
root_document = NextCloudDocument.collection(locator=root_locator) if len(doc_stack) > depth:
doc_stack.pop()
for r in root_properties: current_document = self.get_document(current)
doc_path = r.find("{DAV:}href")
if doc_path is not None: if not doc_stack or current_document.is_collection:
doc_path = doc_path.text doc_stack.append(current_document)
if doc_path == root_document.source_locator: if current_document.is_collection:
for child in ET.fromstring(current_document.raw):
path_match = child.find("{DAV:}href")
path = (
f"{self.base_url}{path_match.text}"
if path_match is not None
else ""
)
if path == current or not path:
continue continue
full_document_path = f"{self.base_url}/{doc_path}" fetch_queue.append((path, depth + 1))
root_document.children.append(self.get_document(full_document_path))
return root_document else:
doc_stack[-1].children.append(current_document)
return doc_stack[0]
def get_document(self, path: str) -> NextCloudDocument: def get_document(self, path: str) -> NextCloudDocument:
p = pathlib.Path(path) p = pathlib.Path(path)
is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix)
with self._client as client: with self._client as client:
verb = "PROPFIND" if not p.suffix else "GET" verb = "PROPFIND" if not is_doc else "GET"
request = httpx.Request(verb, path) request = httpx.Request(verb, path)
response = client.send(request) response = client.send(request)
if p.suffix: if is_doc:
return NextCloudDocument.doc(locator=path, raw=response.text) return NextCloudDocument.doc(locator=path, raw=response.text)
else: else:
return NextCloudDocument.collection(locator=path) return NextCloudDocument.collection(locator=path, raw=response.text)