WIP: refactor: iteratively crawl, fetch and build document tree #3

Draft
marc wants to merge 1 commit from feat/support-image-attachments into main
2 changed files with 35 additions and 18 deletions
Showing only changes of commit db7ecf0ba2 - Show all commits

View file

@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):
return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")
@property
def is_collection(self) -> bool:
    """Return True when this document's ``type`` is ``"collection"``."""
    return self.type == "collection"
@property
@abc.abstractmethod
def metadata(self) -> dict[str, object]:

View file

@@ -1,6 +1,7 @@
import httpx
import frontmatter
import re
import pathlib
import functools
import xml.etree.ElementTree as ET
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
)
def get_document_tree(self, root: str) -> NextCloudDocument:
    """Crawl a NextCloud folder iteratively and return it as a document tree.

    Starting at ``root``, every resource is fetched through
    ``get_document``. A collection is expanded from the ``{DAV:}href``
    entries found in its raw listing, and every fetched resource (file or
    sub-collection) is appended to the ``children`` of the collection that
    listed it.

    Args:
        root: Path of the folder to crawl, relative to the user's files
            under ``/remote.php/dav/files/<user>/``.

    Returns:
        The document fetched for ``root``; its descendants are reachable
        through ``children``.
    """
    root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"

    root_document = None
    # Each pending entry carries the collection that listed it, so the
    # parent is always known without tracking crawl depth separately.
    pending = [(root_locator, None)]
    # Locators already scheduled. This skips a collection's own entry in
    # its listing and guarantees the crawl terminates.
    seen = {root_locator}

    while pending:
        # pop() takes from the end, so entries of one listing are visited
        # in reverse listing order.
        locator, parent = pending.pop()
        document = self.get_document(locator)

        if parent is None:
            root_document = document
        else:
            parent.children.append(document)

        if not document.is_collection:
            continue

        for entry in ET.fromstring(document.raw):
            href = entry.find("{DAV:}href")
            if href is None or not href.text:
                continue

            child_locator = f"{self.base_url}{href.text}"
            if child_locator in seen:
                continue

            seen.add(child_locator)
            pending.append((child_locator, document))

    return root_document
def get_document(self, path: str) -> NextCloudDocument:
    """Fetch one NextCloud resource and wrap it as a document.

    The kind of resource is decided from the locator alone: when the last
    path component ends in a purely alphabetic extension (``.md``,
    ``.png``) it is treated as a file and downloaded with ``GET``;
    anything else (no extension, or one containing digits or other
    characters) is treated as a collection and queried with ``PROPFIND``.

    Args:
        path: Absolute URL of the resource.

    Returns:
        ``NextCloudDocument.doc(...)`` for a file or
        ``NextCloudDocument.collection(...)`` for a collection; in both
        cases ``raw`` is the response body text.
    """
    suffix = pathlib.Path(path).suffix
    is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", suffix) is not None

    if is_doc:
        request = httpx.Request("GET", path)
    else:
        # RFC 4918 treats a PROPFIND without a Depth header as
        # "Depth: infinity". The tree crawl expects a listing of the
        # collection itself plus its direct members, so ask for it
        # explicitly.
        request = httpx.Request("PROPFIND", path, headers={"Depth": "1"})

    # NOTE(review): an httpx.Client is closed when its ``with`` block exits
    # and cannot be re-entered afterwards. Confirm ``self._client`` yields a
    # fresh client on every access, otherwise a second call will fail.
    with self._client as client:
        response = client.send(request)

        # NOTE(review): the HTTP status is not checked, so an error body is
        # stored as ``raw``. Consider ``response.raise_for_status()``.
        if is_doc:
            return NextCloudDocument.doc(locator=path, raw=response.text)
        return NextCloudDocument.collection(locator=path, raw=response.text)