refactor: iteratively crawl, fetch and build document tree
parent d0a7adaf5a
commit db7ecf0ba2

2 changed files with 35 additions and 18 deletions
@@ -24,6 +24,10 @@ class Document(pydantic.BaseModel, abc.ABC):
         return cls(**kwargs, source_locator=pathlib.Path(locator), type="document")
 
+    @property
+    def is_collection(self) -> bool:
+        return self.type == "collection"
+
     @property
     @abc.abstractmethod
     def metadata(self) -> dict[str, object]:
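Side note on the new property: is_collection is what the reworked traversal in the second file branches on, so collections get expanded further while plain documents are attached to their parent. A minimal, self-contained sketch of the same discriminator pattern (the Item model and the example values are illustrative, not the repository's Document class):

import pydantic


class Item(pydantic.BaseModel):
    # "document" or "collection"; mirrors the `type` field used by Document.
    type: str

    @property
    def is_collection(self) -> bool:
        return self.type == "collection"


folder = Item(type="collection")
note = Item(type="document")
assert folder.is_collection and not note.is_collection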
@@ -1,6 +1,7 @@
 import httpx
 import frontmatter
 
+import re
 import pathlib
 import functools
 import xml.etree.ElementTree as ET
@@ -57,39 +58,51 @@ class NextCloudClient(DocumentClient, arbitrary_types_allowed=True):
         )
 
     def get_document_tree(self, root: str) -> NextCloudDocument:
-        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}"
-        request = httpx.Request("PROPFIND", root_locator)
-
-        with self._client as client:
-            response = client.send(request)
-
-        root_properties = ET.fromstring(response.text)
-
-        root_document = NextCloudDocument.collection(locator=root_locator)
-
-        for r in root_properties:
-            doc_path = r.find("{DAV:}href")
-
-            if doc_path is not None:
-                doc_path = doc_path.text
-
-                if doc_path == root_document.source_locator:
-                    continue
-
-                full_document_path = f"{self.base_url}/{doc_path}"
-                root_document.children.append(self.get_document(full_document_path))
-
-        return root_document
+        root_locator = f"{self.base_url}/remote.php/dav/files/{self.user}/{root}/"
+        fetch_queue = [(root_locator, 0)]
+        doc_stack = []
+
+        while fetch_queue:
+            current, depth = fetch_queue.pop()
+
+            if len(doc_stack) > depth:
+                doc_stack.pop()
+
+            current_document = self.get_document(current)
+
+            if not doc_stack or current_document.is_collection:
+                doc_stack.append(current_document)
+
+            if current_document.is_collection:
+                for child in ET.fromstring(current_document.raw):
+                    path_match = child.find("{DAV:}href")
+                    path = (
+                        f"{self.base_url}{path_match.text}"
+                        if path_match is not None
+                        else ""
+                    )
+
+                    if path == current or not path:
+                        continue
+
+                    fetch_queue.append((path, depth + 1))
+            else:
+                doc_stack[-1].children.append(current_document)
+
+        return doc_stack[0]
 
     def get_document(self, path: str) -> NextCloudDocument:
         p = pathlib.Path(path)
+        is_doc = re.fullmatch(r"^\.[a-zA-Z]+$", p.suffix)
 
         with self._client as client:
-            verb = "PROPFIND" if not p.suffix else "GET"
+            verb = "PROPFIND" if not is_doc else "GET"
             request = httpx.Request(verb, path)
             response = client.send(request)
 
-        if p.suffix:
+        if is_doc:
             return NextCloudDocument.doc(locator=path, raw=response.text)
         else:
-            return NextCloudDocument.collection(locator=path)
+            return NextCloudDocument.collection(locator=path, raw=response.text)
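For context on the traversal above: instead of issuing one PROPFIND and attaching a single level of children, get_document_tree now drains an explicit queue of (locator, depth) pairs and keeps a stack of ancestor collections so each fetched document can be attached to its parent. The sketch below shows that queue-plus-ancestor-stack shape in isolation; FAKE_LISTING, Node and build_tree are illustrative stand-ins for the WebDAV client and NextCloudDocument, not code from this repository, and it is not a line-for-line copy of the commit.

from dataclasses import dataclass, field

# Illustrative stand-in for the remote server: maps a collection path to the
# paths it contains (a trailing "/" marks a collection).
FAKE_LISTING: dict[str, list[str]] = {
    "/notes/": ["/notes/ideas/", "/notes/todo.md"],
    "/notes/ideas/": ["/notes/ideas/blog.md"],
}


@dataclass
class Node:
    path: str
    is_collection: bool
    children: list["Node"] = field(default_factory=list)


def build_tree(root: str) -> Node:
    """Crawl `root` iteratively; assumes `root` is a collection (ends with "/")."""
    fetch_queue: list[tuple[str, int]] = [(root, 0)]  # (path, depth) still to fetch
    doc_stack: list[Node] = []                        # collections above the current path

    while fetch_queue:
        current, depth = fetch_queue.pop()  # LIFO pop -> depth-first order

        # Unwind ancestors we have finished descending into.
        while len(doc_stack) > depth:
            doc_stack.pop()

        node = Node(path=current, is_collection=current.endswith("/"))
        if doc_stack:
            doc_stack[-1].children.append(node)  # attach to the nearest open collection

        if node.is_collection:
            doc_stack.append(node)  # becomes the parent of deeper entries
            for child_path in FAKE_LISTING.get(current, []):
                fetch_queue.append((child_path, depth + 1))

    return doc_stack[0]  # the root collection, now holding the full tree

With the listing above, build_tree("/notes/") returns a root node whose children are /notes/todo.md and the /notes/ideas/ collection, which in turn contains /notes/ideas/blog.md.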