wip: tidy up threaded indexing
commit e82bc7490b
parent eaf669aa85
4 changed files with 34 additions and 33 deletions
@@ -5,5 +5,6 @@ SETTINGS_KEYS = [
     "EXCLUDES",
     "FILE_TYPES",
     "SIGNIFICANCE_THRESHOLD",
+    "INDEXING_PROCESSES",
 ]
 QUERY_STRING_LENGTH = 1024
@@ -3,12 +3,13 @@ from pathlib import Path
 from typing import Dict, List
 import re
 from time import perf_counter
-from multiprocessing import Pool, Process, Manager
+from multiprocessing import Pool
 
 import attr
 
 from settings import settings
 
+from process_utils import chunkify_content
 from document_models import Corpus
 from trigram_index import TrigramIndex
 from line_index import LineIndex
@@ -54,7 +55,7 @@ class Indexer(IndexerBase):
         logger.info(f"[Discovery] Discovered {len(discovered)} files.")
 
         self._preload(discovered)
-        self._async_process(self.corpus.collect_unprocessed_documents())
+        self._populate_indices(self.corpus.collect_unprocessed_documents())
         end_time = perf_counter()
 
         logger.info(
@@ -131,44 +132,27 @@ class Indexer(IndexerBase):
             logger.exception(e)
             logger.error(f"Could not read {discovered_file}, skipping.")
 
-    # TODO: Tidy up.
-    def _async_process(self, uids):
-        processes = 4  # Settings?
-        chunk_size = int(len(uids) / processes)  # Extract into process utils
-        chunk_boundary = 0
+    def _populate_indices(self, uids):
+        processes = settings.INDEXING_PROCESSES
         pool = Pool(processes=processes)
 
-        chunks = []
-
-        for i in range(processes):
-            if i == processes - 1:
-                chunks.append(uids[chunk_boundary:])
-            else:
-                chunks.append(uids[chunk_boundary : chunk_boundary + chunk_size])
-
-            chunk_boundary += chunk_size
-
-        r = pool.map(self._bulk_process, chunks)  # is pool the best way?
-        for rs in r:
-            self._trigram_index._trigrams.update(rs[0])
-            self._line_index._lines.update(rs[1])
+        chunks = chunkify_content(uids, processes)
+        results = pool.map(self._bulk_process, chunks)
+        # TODO: Refactor indices to populate cleanly.
+        for result in results:
+            self._trigram_index._trigrams.update(result[0])
+            self._line_index._lines.update(result[1])
 
     def _bulk_process(self, uids: List[str]):
         for uid in uids:
-            self._process(uid)
+            document = self.corpus.get_document(uid=uid)
+            path = document.key
+            content = document.content
+            self._trigram_index.index(path, content)
+            self._line_index.index(path, content)
+            logger.info(f"[Indexing] Processed {path}")
 
         return (self._trigram_index._trigrams, self._line_index._lines)
 
-    def _process(self, uid: str):
-        document = self.corpus.get_document(uid=uid)
-        path = document.key
-        content = document.content
-
-        self._trigram_index.index(path, content)
-
-        self._line_index.index(path, content)
-        logger.info(f"[Indexing] Processed {path}")
-
     def _find_closest_line(self, path, index):
         content = self._line_index.query(path)
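For context, the refactored _populate_indices follows a plain fan-out/merge pattern: chunk the uid list, map the chunks over a multiprocessing Pool, and merge the partial index dicts each worker returns. Below is a minimal, self-contained sketch of that pattern; build_partial_index and the sample uids are illustrative only, not code from this repository.

# Sketch of the fan-out/merge pattern (illustrative names, not repo code).
from multiprocessing import Pool


def build_partial_index(uids):
    # Each worker builds its own partial mapping and returns it, because
    # index state is not shared across processes.
    return {uid: f"indexed:{uid}" for uid in uids}


def main():
    uids = [f"doc-{i}" for i in range(10)]
    processes = 4

    # Same chunking idea as chunkify_content: equal slices, with the
    # final chunk absorbing the remainder.
    chunk_size = len(uids) // processes
    chunks = [uids[i * chunk_size:(i + 1) * chunk_size] for i in range(processes - 1)]
    chunks.append(uids[(processes - 1) * chunk_size:])

    with Pool(processes=processes) as pool:
        partials = pool.map(build_partial_index, chunks)

    # Merge each worker's partial result back into the parent's index,
    # mirroring the _trigrams / _lines update loop in the diff above.
    merged = {}
    for partial in partials:
        merged.update(partial)
    assert len(merged) == len(uids)


if __name__ == "__main__":
    main()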
src/process_utils.py (new file, +15)
@@ -0,0 +1,15 @@
+def chunkify_content(content, chunk_count, chunk_size=None):
+    if chunk_size is None:
+        chunk_size = int(len(content) / chunk_count)
+    chunks = []
+    last_boundary = 0
+
+    for i in range(chunk_count):
+        if i == chunk_count - 1:
+            chunks.append(content[last_boundary:])
+        else:
+            chunks.append(content[last_boundary : last_boundary + chunk_size])
+
+        last_boundary += chunk_size
+
+    return chunks
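A quick usage sketch (not part of the commit) of how chunkify_content splits work across processes; the sample list is illustrative only.

# Not part of the commit: illustrative call showing the chunk shapes.
from process_utils import chunkify_content

uids = [f"doc-{i}" for i in range(10)]
chunks = chunkify_content(uids, 4)
# chunk_size = int(10 / 4) = 2, and the last chunk absorbs the remainder:
# [['doc-0', 'doc-1'], ['doc-2', 'doc-3'], ['doc-4', 'doc-5'],
#  ['doc-6', 'doc-7', 'doc-8', 'doc-9']]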
@@ -14,6 +14,7 @@ default_settings = {
     "FILE_TYPES": [],
     "SIGNIFICANCE_THRESHOLD": 0,
     "WATCHED": [],
+    "INDEXING_PROCESSES": 4,
 }
 
 