refactor: iteratively crawl, fetch and build document tree
This commit is contained in:
parent
d0a7adaf5a
commit
db7ecf0ba2
2 changed files with 35 additions and 18 deletions
|
@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):
|
|||
|
||||
return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")
|
||||
|
||||
@property
|
||||
def is_collection(self) -> bool:
|
||||
return self.type == "collection"
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def metadata(self) -> dict[str, object]:
|
||||
|
|
|
@@ -1,6 +1,7 @@
|
|||
import httpx
|
||||
import frontmatter
|
||||
|
||||
import re
|
||||
import pathlib
|
||||
import functools
|
||||
import xml.etree.ElementTree as ET
|
||||
|
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
|
|||
)
|
||||
|
||||
def get_document_tree(self, root: str) -> NextCloudDocument:
|
||||
root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}"
|
||||
request = httpx.Request("PROPFIND", root_locator)
|
||||
root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"
|
||||
|
||||
with self._client as client:
|
||||
response = client.send(request)
|
||||
fetch_queue = [(root_locator, 0)]
|
||||
doc_stack = []
|
||||
|
||||
root_properties = ET.fromstring(response.text)
|
||||
while fetch_queue:
|
||||
current, depth = fetch_queue.pop()
|
||||
|
||||
root_document = NextCloudDocument.collection(locator=root_locator)
|
||||
if len(doc_stack) > depth:
|
||||
doc_stack.pop()
|
||||
|
||||
for r in root_properties:
|
||||
doc_path = r.find("{DAV:}href")
|
||||
current_document = self.get_document(current)
|
||||
|
||||
if doc_path is not None:
|
||||
doc_path = doc_path.text
|
||||
if not doc_stack or current_document.is_collection:
|
||||
doc_stack.append(current_document)
|
||||
|
||||
if doc_path == root_document.source_locator:
|
||||
continue
|
||||
if current_document.is_collection:
|
||||
for child in ET.fromstring(current_document.raw):
|
||||
path_match = child.find("{DAV:}href")
|
||||
path = (
|
||||
f"{self.base_url}{path_match.text}"
|
||||
if path_match is not None
|
||||
else ""
|
||||
)
|
||||
|
||||
full_document_path = f"{self.base_url}/{doc_path}"
|
||||
root_document.children.append(self.get_document(full_document_path))
|
||||
if path == current or not path:
|
||||
continue
|
||||
|
||||
return root_document
|
||||
fetch_queue.append((path, depth + 1))
|
||||
|
||||
else:
|
||||
doc_stack[-1].children.append(current_document)
|
||||
|
||||
return doc_stack[0]
|
||||
|
||||
def get_document(self, path: str) -> NextCloudDocument:
|
||||
p = pathlib.Path(path)
|
||||
is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix)
|
||||
|
||||
with self._client as client:
|
||||
verb = "PROPFIND" if not p.suffix else "GET"
|
||||
verb = "PROPFIND" if not is_doc else "GET"
|
||||
request = httpx.Request(verb, path)
|
||||
response = client.send(request)
|
||||
|
||||
if p.suffix:
|
||||
if is_doc:
|
||||
return NextCloudDocument.doc(locator=path, raw=response.text)
|
||||
else:
|
||||
return NextCloudDocument.collection(locator=path)
|
||||
return NextCloudDocument.collection(locator=path, raw=response.text)
|
||||
|
|
Loading…
Reference in a new issue