chore: package reorg

test: cov
wip: (dirty) more curses
2020-10-02 23:00:38 -04:00 · 2020-10-01 18:41:27 -04:00 · 2020-09-28 20:26:17 -04:00 · 2020-09-28 19:41:48 -04:00 · 2020-09-28 17:17:03 -04:00 · 2020-09-28 16:46:10 -04:00
25 changed files with 1089 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
 *.sw[a-z]
 # C extensions
 *.so
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,21 @@
 appdirs==1.4.4
 attr==0.3.1
 attrs==20.2.0
 black==20.8b1
 click==7.1.2
 iniconfig==1.0.1
 invoke==1.4.1
 more-itertools==8.5.0
 mypy-extensions==0.4.3
 packaging==20.4
 pathspec==0.8.0
 pluggy==0.13.1
 py==1.9.0
 pyinotify==0.9.6
 pyparsing==2.4.7
 pytest==6.0.2
 regex==2020.7.14
 six==1.15.0
 toml==0.10.1
 typed-ast==1.4.1
 typing-extensions==3.7.4.3
--- a/script/bootstrap
+++ b/script/bootstrap
@ -0,0 +1,15 @@
 # Name of the pyenv-virtualenv environment this script (re)creates.
 VENV=codesearch.venv
 #################################################################
 # Bootstrapping sets up the Python 3.8 venv that allows the use #
 # of the invoke commands.                                       #
 #################################################################
 # NOTE(review): no interpreter version is passed to `pyenv virtualenv`
 # below, so the "3.8" above presumably relies on the active pyenv
 # version -- confirm.
 {
    # Drop any previous environment; deliberately not chained with &&, so
    # the steps below still run when this command fails.
    pyenv virtualenv-delete -f $VENV
    # Every remaining step only runs if the previous one succeeded.
    pyenv virtualenv $VENV &&
    pyenv activate $VENV &&
    python -m pip install -U pip &&
    pip install -r requirements.txt &&
    echo "✨ Good to go! ✨"
 }
--- a/src/init.py
+++ b/src/init.py
--- a/src/snapshots/test_prefix_tree.ambr
+++ b/src/snapshots/test_prefix_tree.ambr
@ -0,0 +1,184 @@
 # name: test_base_tree_has_a_root_node
  <class 'dict'> {
    'children': <class 'list'> [
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_insert_multiple_keys_same_string
  <class 'dict'> {
    'children': <class 'list'> [
      <class 'dict'> {
        'children': <class 'list'> [
          <class 'dict'> {
            'children': <class 'list'> [
              <class 'dict'> {
                'children': <class 'list'> [
                  <class 'dict'> {
                    'children': <class 'list'> [
                    ],
                    'mappings': <class 'list'> [
                      'key_1',
                      'key_2',
                    ],
                    'value': 'd',
                  },
                ],
                'mappings': <class 'list'> [
                ],
                'value': 'c',
              },
            ],
            'mappings': <class 'list'> [
            ],
            'value': 'b',
          },
        ],
        'mappings': <class 'list'> [
        ],
        'value': 'a',
      },
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_insert_overlapping_strings
  <class 'dict'> {
    'children': <class 'list'> [
      <class 'dict'> {
        'children': <class 'list'> [
          <class 'dict'> {
            'children': <class 'list'> [
              <class 'dict'> {
                'children': <class 'list'> [
                  <class 'dict'> {
                    'children': <class 'list'> [
                    ],
                    'mappings': <class 'list'> [
                      'key_1',
                    ],
                    'value': 'd',
                  },
                  <class 'dict'> {
                    'children': <class 'list'> [
                    ],
                    'mappings': <class 'list'> [
                      'key_2',
                    ],
                    'value': 'e',
                  },
                ],
                'mappings': <class 'list'> [
                ],
                'value': 'c',
              },
            ],
            'mappings': <class 'list'> [
            ],
            'value': 'b',
          },
        ],
        'mappings': <class 'list'> [
        ],
        'value': 'a',
      },
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_insert_single_character_
  <class 'dict'> {
    'children': <class 'list'> [
      <class 'dict'> {
        'children': <class 'list'> [
        ],
        'mappings': <class 'list'> [
          'key_1',
        ],
        'value': 'a',
      },
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_insert_single_string
  <class 'dict'> {
    'children': <class 'list'> [
      <class 'dict'> {
        'children': <class 'list'> [
          <class 'dict'> {
            'children': <class 'list'> [
              <class 'dict'> {
                'children': <class 'list'> [
                ],
                'mappings': <class 'list'> [
                  'key_1',
                ],
                'value': 'c',
              },
            ],
            'mappings': <class 'list'> [
            ],
            'value': 'b',
          },
        ],
        'mappings': <class 'list'> [
        ],
        'value': 'a',
      },
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_insert_strings_subsets_of_each_other
  <class 'dict'> {
    'children': <class 'list'> [
      <class 'dict'> {
        'children': <class 'list'> [
          <class 'dict'> {
            'children': <class 'list'> [
              <class 'dict'> {
                'children': <class 'list'> [
                  <class 'dict'> {
                    'children': <class 'list'> [
                    ],
                    'mappings': <class 'list'> [
                      'key_1',
                    ],
                    'value': 'd',
                  },
                ],
                'mappings': <class 'list'> [
                  'key_2',
                ],
                'value': 'c',
              },
            ],
            'mappings': <class 'list'> [
            ],
            'value': 'b',
          },
        ],
        'mappings': <class 'list'> [
        ],
        'value': 'a',
      },
    ],
    'mappings': <class 'list'> [
    ],
    'value': None,
  }
 ---
 # name: test_serializes_to_json
  '{"value": null, "mappings": [], "children": [{"value": "a", "mappings": [], "children": [{"value": "b", "mappings": [], "children": [{"value": "c", "mappings": [], "children": [{"value": "d", "mappings": ["key_1"], "children": []}]}]}]}]}'
 ---
--- a/src/codesearch/init.py
+++ b/src/codesearch/init.py
--- a/src/codesearch/base.py
+++ b/src/codesearch/base.py
@ -0,0 +1,18 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional
 class IndexBase(ABC):
    """Abstract interface for an index that can be filled and queried."""

    @abstractmethod
    def index(self, content: str, haystack: Optional[List[str]]):
        """Add ``content`` to the index; subclasses provide the behaviour."""

    @abstractmethod
    def query(self, query: str) -> List[str]:
        """Look ``query`` up in the index; subclasses provide the behaviour."""
 class IndexerBase(ABC):
    """Abstract interface for an object that indexes a list of paths."""

    @abstractmethod
    def index(self, paths: List[str]):
        """Index everything reachable from ``paths``; subclasses implement it."""
--- a/src/codesearch/cli.py
+++ b/src/codesearch/cli.py
@ -0,0 +1,30 @@
 import argparse
 from pathlib import Path
 from .server import Server
 from .indexer import Indexer
 from .client import search
 from .settings import settings
 def main():
    """Entry point of the ``codesearch`` command line.

    ``start`` indexes the configured watched paths and runs the server;
    ``search`` sends the ``--q`` query to a running server. Any other
    command, or ``search`` without a query, exits with a usage error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("command")
    parser.add_argument("--q", required=False)
    args = parser.parse_args()
    if args.command == "start":
        watched = [Path(p).expanduser() for p in settings.WATCHED]
        server = Server(
            indexer=Indexer(
                domain=watched,
                exclusions=settings.EXCLUDES,
                file_types=settings.FILE_TYPES,
            ),
            watched=watched,
        )
        server.run()
    elif args.command == "search":
        # Without this guard a missing --q reaches search() as None and
        # fails with an AttributeError on ``None.encode()``.
        if not args.q:
            parser.error("the search command requires a query (--q)")
        search(args.q)
    else:
        # Previously an unknown command did nothing and exited successfully.
        parser.error(f"unknown command: {args.command}")
--- a/src/codesearch/client.py
+++ b/src/codesearch/client.py
@ -0,0 +1,83 @@
 import socket
 import json
 from pathlib import Path
 import curses
 from .settings import settings
 from .colors import highlight
 def display_handler(stdscr, buffer):
    """Show ``buffer`` in a scrollable curses view until ``q``/``Q`` is pressed.

    ``buffer`` is a list of dicts. Entries whose ``type`` is ``"path"`` are
    drawn in green, ``"sep"`` entries leave a blank row, and every other
    entry is drawn as a code line: ``lineno`` in green followed by ``value``.
    Up/Down scroll by one entry; PageUp/PageDown scroll by about one screen.
    """
    current_y = 0
    stdscr.refresh()
    curses.start_color()
    y, x = stdscr.getmaxyx()
    curses.init_pair(1, curses.COLOR_GREEN, curses.COLOR_BLACK)
    pad = curses.newpad(y, x)
    # Column where code text starts, right of the line-number gutter.
    gutter = 5
    # Never write into the last column: filling the bottom-right cell of a
    # window makes curses raise.
    usable_width = max(0, x - 1)
    # Highest index allowed on the top row; scrolling past it used to index
    # beyond the end of the buffer.
    last_top = max(0, len(buffer) - 1)
    while True:
        pad.clear()
        # Slicing cannot raise, so short buffers and the tail are both safe.
        for row, entry in enumerate(buffer[current_y : current_y + y]):
            if entry["type"] == "path":
                pad.addnstr(row, 0, entry["value"], usable_width, curses.color_pair(1))
            elif entry["type"] != "sep":
                pad.addnstr(
                    row,
                    0,
                    str(entry["lineno"]),
                    min(gutter, usable_width),
                    curses.color_pair(1),
                )
                if usable_width > gutter:
                    # Truncate instead of overflowing the pad on long lines.
                    pad.addnstr(row, gutter, entry["value"], usable_width - gutter)
        pad.refresh(0, 0, 0, 0, y - 1, x - 1)
        key = stdscr.getch()
        if key in (ord("Q"), ord("q")):
            break
        elif key == curses.KEY_UP:
            current_y = max(0, current_y - 1)
        elif key == curses.KEY_DOWN:
            current_y = min(last_top, current_y + 1)
        elif key == curses.KEY_NPAGE:
            current_y = min(last_top, current_y + y + 1)
        elif key == curses.KEY_PPAGE:
            current_y = max(0, current_y - y - 1)
 def search(query):
    """Send ``query`` to the running server and page through the results.

    The server writes its JSON results to ``settings.BUFFER_PATH`` and then
    replies on the socket; that reply is the signal that the buffer file can
    be read. Each result is expanded into a path entry, one code entry per
    line of the referenced excerpt, and a separator.

    Raises:
        ValueError: if ``query`` is empty or ``None``.
    """
    # An empty query is never answered by the server, so fail fast instead
    # of blocking on recv().
    if not query:
        raise ValueError("search requires a non-empty query string")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect((settings.SOCKET_HOST, settings.SOCKET_PORT))
        s.sendall(query.encode())
        reply = s.recv(8)
    # An empty reply means the peer closed the connection without answering.
    if not reply:
        print("The server closed the connection without returning results.")
        return
    with open(Path(settings.BUFFER_PATH).expanduser(), "rb") as infile:
        results = json.loads(infile.read().decode())
    output = []
    # Several hits usually share a file; read each file only once.
    contents = {}
    for result in results:
        key = result["key"]
        if key not in contents:
            with open(key, "r") as infile:
                contents[key] = infile.read()
        highlighted_text = contents[key][result["offset_start"] : result["offset_end"]]
        output.append({"value": key, "type": "path"})
        for line_number, l in enumerate(
            highlighted_text.split("\n"), start=result["line_start"]
        ):
            output.append({"value": l, "type": "code", "lineno": line_number})
        output.append({"type": "sep"})
    curses.wrapper(display_handler, output)
--- a/src/codesearch/colors.py
+++ b/src/codesearch/colors.py
@ -0,0 +1,7 @@
 # ANSI escape sequences that switch the terminal foreground colour.
 COLORS = {
    "green": "\033[92m",
    "yellow": "\033[93m",
    "red": "\033[91m",
 }
 # ANSI escape sequence that resets terminal attributes.
 ENDC = "\033[0m"
 def highlight(text, color="green"):
    """Wrap ``text`` in the escape codes for ``color`` followed by a reset.

    ``color`` must be a key of ``COLORS``; any other value raises KeyError.
    """
    return "{}{}{}".format(COLORS[color], text, ENDC)
--- a/src/codesearch/constants.py
+++ b/src/codesearch/constants.py
@ -0,0 +1,11 @@
 # Names that may be read as attributes of the settings object; the Settings
 # class rejects any attribute lookup whose name is not listed here.
 SETTINGS_KEYS = [
    "WATCHED",
    "SOCKET_PORT",
    "SOCKET_HOST",
    "EXCLUDES",
    "FILE_TYPES",
    "SIGNIFICANCE_THRESHOLD",
    "INDEXING_PROCESSES",
    "BUFFER_PATH",
 ]
 # Maximum number of bytes the server reads from a client connection when
 # receiving a query string.
 QUERY_STRING_LENGTH = 1024
--- a/src/codesearch/document_models.py
+++ b/src/codesearch/document_models.py
@ -0,0 +1,52 @@
 import attr
@attr.s
class Corpus:
    """In-memory registry of documents, addressable by uid or by key."""

    # uid -> Document
    _documents = attr.ib(default=attr.Factory(dict))
    # key -> uid of the document most recently registered under that key
    _key_to_uid = attr.ib(default=attr.Factory(dict))

    @property
    def document_count(self):
        """Number of documents registered so far."""
        return len(self._documents)

    def add_document(self, key, content):
        """Register a document and return its uid (``document:<n>``)."""
        new_uid = f"document:{self.document_count}"
        document = Document(uid=new_uid, key=key, content=content)
        self._documents[new_uid] = document
        self._key_to_uid[key] = new_uid
        return new_uid

    def get_document(self, uid=None, key=None):
        """Return a document by ``key`` when given, otherwise by ``uid``."""
        lookup_uid = self._key_to_uid[key] if key else uid
        return self._documents[lookup_uid]

    def collect_unprocessed_documents(self):
        """Return the uids of documents not yet marked as processed."""
        pending = []
        for uid, document in self._documents.items():
            if not document.is_processed:
                pending.append(uid)
        return pending

    def mark_document_as_processed(self, uid):
        """Flag the document identified by ``uid`` as processed."""
        document = self._documents[uid]
        document.mark_as_processed()
@attr.s
class Document:
    """A corpus entry: its uid, its key, its content and a processed flag."""

    uid = attr.ib()
    key = attr.ib()
    content = attr.ib()
    _processed = attr.ib(default=False)

    def mark_as_processed(self):
        """Set the processed flag."""
        self._processed = True

    @property
    def is_processed(self):
        """Whether ``mark_as_processed`` has been called on this document."""
        return self._processed
--- a/src/codesearch/indexer.py
+++ b/src/codesearch/indexer.py
@ -0,0 +1,211 @@
 from .base import IndexerBase
 from pathlib import Path
 from typing import Dict, List
 import re
 from time import perf_counter
 from multiprocessing import Pool
 import mmap
 import attr
 from .settings import settings
 from .process_utils import chunkify_content
 from .document_models import Corpus
 from .trigram_index import TrigramIndex
 from .line_index import LineIndex
 from .logger import get_logger
 logger = get_logger(__name__)
@attr.s
class SearchResult:
    """One query hit: the file key plus the offset and line span to show."""

    key = attr.ib()
    offset_start = attr.ib()
    offset_end = attr.ib()
    line_start = attr.ib()
    line_end = attr.ib()

    def to_dict(self):
        """Return the fields as a plain dict."""
        field_names = (
            "key",
            "offset_start",
            "offset_end",
            "line_start",
            "line_end",
        )
        return {name: getattr(self, name) for name in field_names}
@attr.s
 class Indexer(IndexerBase):
    # Indices
    _trigram_index = attr.ib(default=attr.Factory(TrigramIndex))
    _line_index = attr.ib(default=attr.Factory(LineIndex))
    _exclusions = attr.ib(default=attr.Factory(list))
    _file_types = attr.ib(default=attr.Factory(list))
    # Document corpus
    corpus = attr.ib(default=attr.Factory(Corpus))
    domain = attr.ib(default=attr.Factory(list))
    def index(self, paths: List[str]):
        """Discover files under ``paths``, register them and index the backlog.

        Every discovered file is added to the corpus, then all documents the
        corpus reports as unprocessed are handed to ``_populate_indices``.
        """
        started_at = perf_counter()
        discovered = []
        for path in paths:
            discovered += self._discover(path)
        logger.info(f"Discovered {len(discovered)} files.", prefix="Discovery")
        self._build_corpus(discovered)
        pending = self.corpus.collect_unprocessed_documents()
        self._populate_indices(pending)
        elapsed = perf_counter() - started_at
        logger.info(
            f"{self.corpus.document_count} total files indexed in {elapsed} seconds.",
            prefix="Index status",
        )
    def query(self, query: str):
        """Return every match of ``query`` as a list of result dicts.

        The trigram index first narrows the corpus down to candidate files.
        Each candidate is then read from disk and scanned with
        ``re.finditer``, so at that stage ``query`` is interpreted as a
        regular expression. Every match is widened to a padded line range
        and reported with the offsets of that range taken from the line
        index. Files that cannot be read are logged and skipped.
        """
        start_time = perf_counter()
        leads = self._trigram_index.query(query)
        logger.info(
            f"Narrowed down to {len(leads)} files via trigram search", prefix="Query"
        )
        confirmed = []
        uniques = 0
        for uid, _score in leads:
            lead_path = self.corpus.get_document(uid=uid).key
            try:
                # Both context managers matter: previously the mapping was
                # never closed, leaking one mmap per candidate file.
                with open(lead_path, "r") as infile, mmap.mmap(
                    infile.fileno(), 0, prot=mmap.PROT_READ
                ) as mapped:
                    lead_content = mapped.read().decode()
            except Exception as e:
                logger.warning(e)
                logger.warning(f"No content in {lead_path}", prefix="Query")
                continue
            hits_in_lead = []
            for hit in re.finditer(query, lead_content):
                start_line, end_line = self._find_line_range(
                    lead_path, hit.start(), hit.end()
                )
                line_offsets = self._line_index.query(lead_path)
                hits_in_lead.append(
                    SearchResult(
                        key=lead_path,
                        offset_start=line_offsets[start_line][0],
                        offset_end=line_offsets[end_line][1],
                        line_start=start_line,
                        line_end=end_line,
                    )
                )
            if hits_in_lead:
                confirmed.extend(hits_in_lead)
                uniques += 1
        end_time = perf_counter()
        logger.info(
            f"{len(confirmed)} hits in {uniques} files ({end_time - start_time} seconds elapsed).",
            prefix="Query",
        )
        return [r.to_dict() for r in confirmed]
    def _discover(self, path_root: str) -> List[str]:
        """Recursively collect indexable files under ``path_root``.

        A path matching any exclusion pattern is dropped together with
        everything below it. Directories are walked recursively; a file is
        kept only when its suffix is listed in the configured file types.
        Returns the collected paths (``path_root`` itself for a kept file).
        """
        current = Path(path_root)
        # Avoid any excluded paths; the generator stops at the first match.
        if any(current.match(pattern) for pattern in self._exclusions):
            logger.info(f"{path_root} excluded.", prefix="Discovery")
            return []
        if current.is_dir():
            collected = []
            for child_path in current.iterdir():
                collected.extend(self._discover(str(child_path)))
            return collected
        if current.suffix not in self._file_types:
            return []
        logger.info(f"Collected {path_root}", prefix="Discovery")
        return [path_root]
    def _build_corpus(self, discovered: List[str]):
        """Register every discovered path in the corpus with empty content."""
        total = len(discovered)
        for position, discovered_file in enumerate(discovered, start=1):
            self.corpus.add_document(key=discovered_file, content="")
            logger.info(
                f"({position}/{total}) Registered {discovered_file} in corpus",
                prefix="Corpus building",
            )
    def _populate_indices(self, uids):
        """Index the documents in ``uids`` using a pool of worker processes.

        The uids are split into one chunk per configured process and each
        chunk is handled by ``_bulk_process``. The trigrams and line tables
        the workers return are merged into this process's indices. A
        document is marked as processed once its trigrams are merged;
        documents a worker could not read stay unprocessed.
        """
        if not uids:
            # Nothing to do; avoid spawning workers for an empty backlog.
            return
        processes = settings.INDEXING_PROCESSES
        chunks = chunkify_content(uids, processes)
        # The context manager shuts the pool down; it used to be left open,
        # leaking worker processes on every call.
        with Pool(processes=processes) as pool:
            processed_chunks = pool.map(self._bulk_process, chunks)
        for trigrams_by_uid, line_table in processed_chunks:
            for uid, trigrams in trigrams_by_uid.items():
                self._trigram_index.index(
                    uid.replace("document:", ""), None, None, trigrams
                )
                # Without this the corpus keeps reporting every document as
                # unprocessed and each re-index redoes the whole corpus.
                self.corpus.mark_document_as_processed(uid)
            self._line_index._lines.update(line_table)
    # TODO: Tidy up, rethink w.r.t. multiprocessing.
    def _bulk_process(self, uids: List[str]):
        """Read the documents in ``uids`` and compute their index data.

        Used as the ``Pool.map`` target of ``_populate_indices``. For each
        readable document the trigram set is computed and its lines are
        recorded in this object's line index. Unreadable documents are
        logged and skipped.

        Returns:
            ``(trigrams, lines)``: trigram sets keyed by uid, and the line
            index's table after processing the chunk.
        """
        trigrams = {}
        total = len(uids)
        for current, uid in enumerate(uids, start=1):
            path = self.corpus.get_document(uid=uid).key
            try:
                # Close both the file and the mapping; the mapping used to
                # be left open for every processed document.
                with open(path, "r") as document_file, mmap.mmap(
                    document_file.fileno(), 0, prot=mmap.PROT_READ
                ) as mapped_file:
                    content = mapped_file.read().decode()
                trigrams[uid] = TrigramIndex.trigramize(content)
                self._line_index.index(path, content)
            except Exception as e:
                logger.info(e)
                logger.warning(
                    f"({current}/{total}) Could not read {path}, skipping.",
                    prefix="Indexing",
                )
            else:
                logger.info(
                    f"({current}/{total}) Processed {path}", prefix="Indexing"
                )
        return (trigrams, self._line_index._lines)
    def _find_closest_line(self, path, index):
        """Return the number of the line whose offset range contains ``index``.

        Falls back to 0 when no recorded range contains ``index``.
        """
        for line_number, (first, last) in self._line_index.query(path).items():
            if first <= index <= last:
                return line_number
        # TODO: This should not be reachable.
        return 0
    def _find_line_range(self, key, start, end, padding=5):
        """Return the line range covering offsets ``start``..``end`` plus context.

        ``padding`` lines are added on each side and the result is clamped
        to the first and last line recorded for ``key``. The parameter was
        previously ignored in favour of a hard-coded 5.

        Returns:
            ``(first_line, last_line)``, both inclusive.
        """
        start_line = self._find_closest_line(key, start)
        end_line = self._find_closest_line(key, end)
        last_line = len(self._line_index.query(key)) - 1
        return (max(0, start_line - padding), min(last_line, end_line + padding))
--- a/src/codesearch/line_index.py
+++ b/src/codesearch/line_index.py
@ -0,0 +1,22 @@
 from .base import IndexBase
 import attr
 from .logger import get_logger
 logger = get_logger(__name__)
@attr.s
class LineIndex(IndexBase):
    """Maps each indexed key to the character offsets of its lines."""

    # key -> {line number: (start offset, end offset)}
    _lines = attr.ib(default=attr.Factory(dict))

    def index(self, key: str, content: str):
        """Record the ``(start, end)`` offsets of every line of ``content``.

        Lines are numbered from 0. ``start`` is the offset of the line's
        first character and ``end`` the offset just past its last one (the
        position of the newline, or ``len(content)`` for the final line).
        Any earlier entry for ``key`` is replaced.
        """
        offsets = {}
        current = 0
        for count, line in enumerate(content.split("\n")):
            offsets[count] = (current, current + len(line))
            # +1 accounts for the "\n" consumed by split(); without it every
            # line after the first drifts one character further from its
            # real position in ``content``.
            current += len(line) + 1
        self._lines[key] = offsets

    def query(self, key: str):
        """Return the line table recorded for ``key`` (KeyError if absent)."""
        return self._lines[key]
--- a/src/codesearch/logger.py
+++ b/src/codesearch/logger.py
@ -0,0 +1,35 @@
 import logging
 import sys
 import attr
 from .colors import highlight
@attr.s
class Logger:
    """Thin wrapper over a ``logging`` logger that adds a coloured prefix."""

    logger = attr.ib()

    def info(self, message, prefix=None):
        """Log ``message`` at INFO level, tagged with a green ``[prefix]``."""
        tag = highlight(f"[{prefix}]", "green") if prefix else ""
        self.logger.info(f"{tag} {message}")

    def warning(self, message, prefix=None):
        """Log ``message`` at WARNING level, tagged with a yellow ``[prefix]``."""
        tag = highlight(f"[{prefix}]", "yellow") if prefix else ""
        self.logger.warning(f"{tag} {message}")
 def get_logger(name):
    """Return a ``Logger`` wrapping the stdlib logger called ``name``.

    The underlying logger is set to INFO and writes to stdout.
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    # logging.getLogger returns the same object for a given name, so only
    # attach the stdout handler once; otherwise a second call for the same
    # name would make every message print twice.
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler(sys.stdout))
    return Logger(logger=logger)
--- a/src/codesearch/prefix_tree.py
+++ b/src/codesearch/prefix_tree.py
@ -0,0 +1,66 @@
 import json
 import attr
@attr.s
class PrefixTree:
    """Character trie that maps inserted values to lists of keys."""

    root = attr.ib()

    @staticmethod
    def initialize():
        """Build a tree whose root node carries no value."""
        return PrefixTree(root=PrefixTreeNode(value=None))

    def insert(self, value, key, current=None):
        """Walk (creating nodes as needed) along ``value`` and record ``key``.

        The walk starts at ``current`` when given, otherwise at the root;
        ``key`` is appended to the mappings of the node where it ends.
        """
        node = self.root if current is None else current
        for symbol in value or ():
            child = node.children.get(symbol)
            if not child:
                child = PrefixTreeNode(value=symbol)
                node.children[symbol] = child
            node = child
        node.mappings.append(key)

    def get(self, value, current=None):
        """Return the mappings stored at ``value``, or None if no such path."""
        node = current if current else self.root
        for symbol in value or ():
            node = node.children.get(symbol)
            if not node:
                return None
        return node.mappings

    def to_dict(self):
        """Return the tree as nested dicts, starting at the root."""
        return self.root.to_dict()

    def to_json(self):
        """Return the ``to_dict`` representation serialised as JSON."""
        as_dict = self.to_dict()
        return json.dumps(as_dict)
@attr.s
class PrefixTreeNode:
    """A trie node: its value, the keys mapped to it, and its children."""

    value = attr.ib()
    mappings = attr.ib(default=attr.Factory(list))
    # child value -> child node
    children = attr.ib(default=attr.Factory(dict))

    def to_dict(self):
        """Return this node and its subtree as nested dicts."""
        serialized_children = []
        for child in self.children.values():
            serialized_children.append(child.to_dict())
        return {
            "value": self.value,
            "mappings": self.mappings,
            "children": serialized_children,
        }
--- a/src/codesearch/process_utils.py
+++ b/src/codesearch/process_utils.py
@ -0,0 +1,15 @@
 def chunkify_content(content, chunk_count, chunk_size=None):
    """Split ``content`` into exactly ``chunk_count`` consecutive slices.

    Args:
        content: any sliceable sequence (list, str, ...).
        chunk_count: number of chunks to produce; must be at least 1.
        chunk_size: length of every chunk but the last. When omitted it is
            ``len(content) / chunk_count`` rounded up, so the work is spread
            evenly instead of piling the remainder onto the last chunk.

    Returns:
        A list of ``chunk_count`` slices that concatenate back to
        ``content``; the last slice takes whatever remains and trailing
        slices may be empty.

    Raises:
        ValueError: if ``chunk_count`` is smaller than 1.
    """
    if chunk_count < 1:
        raise ValueError("chunk_count must be at least 1")
    if chunk_size is None:
        # Ceiling division without floats.
        chunk_size = -(-len(content) // chunk_count)
    chunks = []
    for i in range(chunk_count - 1):
        boundary = i * chunk_size
        chunks.append(content[boundary : boundary + chunk_size])
    # The final chunk absorbs any remainder.
    chunks.append(content[(chunk_count - 1) * chunk_size :])
    return chunks
--- a/src/codesearch/server.py
+++ b/src/codesearch/server.py
@ -0,0 +1,81 @@
 import json
 import socket
 import pyinotify
 import attr
 from codesearch.watcher import WatchHandler
 from codesearch.indexer import Indexer
 from codesearch.constants import QUERY_STRING_LENGTH
 from pathlib import Path
 from codesearch.settings import settings
 from codesearch.logger import get_logger
 logger = get_logger(__name__)
@attr.s
 class Server:
    indexer = attr.ib()
    watched = attr.ib()
    _notifier = attr.ib(default=None)
    _socket = attr.ib(default=None)
    def _handle_socket(self, *, socket):
        """Bind ``socket``, listen, and answer one query per connection forever.

        For each connection the query string is read, run through the
        indexer, and the JSON-encoded results are written to
        ``settings.BUFFER_PATH``; the length of that payload is then sent
        back on the connection. Failures while serving a connection are
        logged and the loop continues. ``KeyboardInterrupt`` is not an
        ``Exception`` subclass and therefore propagates to the caller.
        """
        socket.bind((settings.SOCKET_HOST, settings.SOCKET_PORT))
        socket.listen()
        logger.info(
            f"Listening on {settings.SOCKET_HOST}:{settings.SOCKET_PORT}",
            prefix="Server",
        )
        while True:
            conn, _ = socket.accept()
            # Always close the connection: it used to stay open, so a client
            # waiting for a reply that never came would block forever.
            with conn:
                try:
                    query_string = conn.recv(QUERY_STRING_LENGTH).decode()
                    logger.info(f"Query string: {query_string}", prefix="Query")
                    if query_string:
                        query_results = self.indexer.query(query_string)
                        response = json.dumps(query_results).encode()
                        response_length = str(len(response))
                        with open(
                            Path(settings.BUFFER_PATH).expanduser(), "wb"
                        ) as outfile:
                            outfile.write(response)
                        conn.sendall(response_length.encode())
                except Exception as e:
                    logger.warning(e)
    def _start_socket(self):
        """Open the TCP listening socket and hand it to ``_handle_socket``.

        The socket is stored on the instance and closed when this method
        exits; any exception is logged as a warning and re-raised.
        """
        try:
            socket_obj = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            with socket_obj:
                self._socket = socket_obj
                self._handle_socket(socket=socket_obj)
        except Exception as error:
            logger.warning(error)
            raise
    def _start_watch(self):
        """Watch every configured path and start the notifier thread.

        Each path is registered recursively for all event types; events are
        dispatched to a ``WatchHandler`` bound to this server's indexer. The
        notifier is stored on the instance once it has been started.
        """
        watch_manager = pyinotify.WatchManager()
        for path in self.watched:
            logger.info(f"Watching {path}", prefix="Server")
            watch_manager.add_watch(path, pyinotify.ALL_EVENTS, rec=True)
        handler = WatchHandler(indexer=self.indexer)
        started = pyinotify.ThreadedNotifier(watch_manager, handler)
        started.start()
        self._notifier = started
    def run(self):
        """Index the watched paths, then watch and serve until stopped.

        ``KeyboardInterrupt`` is treated as a normal shutdown request. Any
        other exception propagates after the socket and the notifier have
        been released; previously a bare ``except`` swallowed it.
        """
        self.indexer.index(self.watched)
        try:
            self._start_watch()
            self._start_socket()
        except KeyboardInterrupt:
            logger.info("Interrupted, shutting down.", prefix="Server")
        finally:
            # Either attribute is still None when startup failed before it
            # was assigned; dereferencing it blindly raised AttributeError.
            if self._socket is not None:
                self._socket.close()
            if self._notifier is not None:
                self._notifier.stop()
--- a/src/codesearch/settings.py
+++ b/src/codesearch/settings.py
@ -0,0 +1,45 @@
 import json
 from pathlib import Path
 import attr
 from .constants import SETTINGS_KEYS
 SETTINGS_PATH = "~/.codesearchrc"
 default_settings = {
    "SOCKET_HOST": "127.0.0.1",
    "SOCKET_PORT": 65126,
    "EXCLUDES": [],
    "FILE_TYPES": [],
    "SIGNIFICANCE_THRESHOLD": 0,
    "WATCHED": [],
    "INDEXING_PROCESSES": 4,
    "BUFFER_PATH": "~/.codesearchbuffer",
 }
@attr.s
class Settings:
    """Holds the configuration and exposes its keys as attributes."""

    settings = attr.ib(default=attr.Factory(dict))

    def from_file(self, path: str):
        """Load settings from the JSON file at ``path``.

        When the file does not exist the defaults are used. Keys missing
        from the file fall back to their default values.
        """
        # Check the path that is actually opened below; the check used to
        # look at the module-level SETTINGS_PATH regardless of ``path``.
        settings_path = Path(path).expanduser()
        if not settings_path.exists():
            # Copy so later changes never leak into the shared defaults.
            self.settings = dict(default_settings)
            return
        with open(settings_path, "r") as settings_file:
            loaded = json.load(settings_file)
        self.settings = {**default_settings, **loaded}

    def __getattr__(self, key):
        """Resolve a known settings key; anything else is an AttributeError.

        ``__getattr__`` is expected to signal a missing attribute with
        AttributeError so that ``hasattr`` and ``getattr`` with a default
        keep working; KeyError would escape from them.
        """
        if key not in SETTINGS_KEYS:
            raise AttributeError(f"{key} not a valid settings property")
        try:
            return self.settings[key]
        except KeyError:
            raise AttributeError(f"{key} has no configured value") from None
 settings = Settings()
 settings.from_file(Path(SETTINGS_PATH).expanduser())
--- a/src/codesearch/test_indexer.py
+++ b/src/codesearch/test_indexer.py
@ -0,0 +1,49 @@
 import pytest
 from .indexer import Indexer
@pytest.fixture()
 def indexer():
    """Provide a default-constructed ``Indexer`` for each test."""
    return Indexer()
 def test_indexer_builds_trigram_set_for_given_document(indexer):
    """Indexing one document should yield exactly its set of trigrams."""
    # NOTE(review): this calls ``index(path=..., content=...)`` and reads
    # ``indexer.trigrams``, while Indexer.index in indexer.py takes only
    # ``paths`` and declares no ``trigrams`` attribute -- the test looks
    # stale against that API; confirm.
    mock_document = "now that's a doc"
    mock_path = "/home/documents/cool_doc"
    indexer.index(path=mock_path, content=mock_document)
    # Every 3-character window of the document, in order of appearance.
    expected_trigrams = [
        "now",
        "ow ",
        "w t",
        " th",
        "tha",
        "hat",
        "at'",
        "t's",
        "'s ",
        "s a",
        " a ",
        "a d",
        " do",
        "doc",
    ]
    assert indexer.trigrams == {mock_path: set(expected_trigrams)}
 def test_indexer_preserves_previous_trigram_sets_on_index(indexer):
    """Indexing a second document must keep the first document's trigrams."""
    # NOTE(review): uses the same ``index(path=, content=)`` call and
    # ``trigrams`` attribute that Indexer in indexer.py does not define --
    # presumably stale; confirm.
    mock_document_1 = "wow"
    mock_document_2 = "woa"
    mock_path_1 = "/home"
    mock_path_2 = "/somewhere_else"
    indexer.index(path=mock_path_1, content=mock_document_1)
    assert indexer.trigrams == {mock_path_1: set(["wow"])}
    indexer.index(path=mock_path_2, content=mock_document_2)
    assert indexer.trigrams == {mock_path_1: set(["wow"]), mock_path_2: set(["woa"])}
--- a/src/codesearch/test_prefix_tree.py
+++ b/src/codesearch/test_prefix_tree.py
@ -0,0 +1,67 @@
 import pytest
 from .prefix_tree import PrefixTree
@pytest.fixture
 def prefix_tree():
    """Provide a fresh tree per test, built via ``PrefixTree.initialize()``."""
    return PrefixTree.initialize()
 def test_base_tree_has_a_root_node(prefix_tree, snapshot):
    """A freshly initialised tree serialises to the stored snapshot."""
    # NOTE(review): ``snapshot`` is presumably provided by a snapshot-testing
    # pytest plugin (the .ambr file next to the sources) -- confirm.
    assert prefix_tree.to_dict() == snapshot
 def test_insert_single_string(prefix_tree, snapshot):
    """Inserting one multi-character value makes its key retrievable."""
    mock_value = "abc"
    mock_key = "key_1"
    prefix_tree.insert(value=mock_value, key=mock_key)
    assert prefix_tree.to_dict() == snapshot
    assert prefix_tree.get(value=mock_value) == [mock_key]
 def test_insert_single_character_(prefix_tree, snapshot):
    """Inserting a one-character value makes its key retrievable."""
    mock_value = "a"
    mock_key = "key_1"
    prefix_tree.insert(value=mock_value, key=mock_key)
    assert prefix_tree.to_dict() == snapshot
    assert prefix_tree.get(value=mock_value) == [mock_key]
 def test_insert_overlapping_strings(prefix_tree, snapshot):
    """Two values sharing a prefix keep separate keys at their own leaves."""
    mock_value_1 = "abcd"
    mock_key_1 = "key_1"
    mock_value_2 = "abce"
    mock_key_2 = "key_2"
    prefix_tree.insert(value=mock_value_1, key=mock_key_1)
    prefix_tree.insert(value=mock_value_2, key=mock_key_2)
    assert prefix_tree.to_dict() == snapshot
    assert prefix_tree.get(value=mock_value_1) == [mock_key_1]
    assert prefix_tree.get(value=mock_value_2) == [mock_key_2]
 def test_insert_multiple_keys_same_string(prefix_tree, snapshot):
    """Inserting the same value twice accumulates both keys, in order."""
    mock_value = "abcd"
    mock_key_1 = "key_1"
    mock_key_2 = "key_2"
    prefix_tree.insert(value=mock_value, key=mock_key_1)
    prefix_tree.insert(value=mock_value, key=mock_key_2)
    assert prefix_tree.to_dict() == snapshot
    assert prefix_tree.get(value=mock_value) == [mock_key_1, mock_key_2]
 def test_insert_strings_subsets_of_each_other(prefix_tree, snapshot):
    """A value that is a prefix of another keeps its key on the inner node."""
    mock_value_1 = "abcd"
    mock_key_1 = "key_1"
    mock_value_2 = "abc"
    mock_key_2 = "key_2"
    prefix_tree.insert(value=mock_value_1, key=mock_key_1)
    prefix_tree.insert(value=mock_value_2, key=mock_key_2)
    assert prefix_tree.to_dict() == snapshot
    assert prefix_tree.get(value=mock_value_1) == [mock_key_1]
    assert prefix_tree.get(value=mock_value_2) == [mock_key_2]
 def test_serializes_to_json(prefix_tree, snapshot):
    """``to_json`` output for a single inserted value matches the snapshot."""
    prefix_tree.insert(value="abcd", key="key_1")
    assert prefix_tree.to_json() == snapshot
--- a/src/codesearch/trigram_index.py
+++ b/src/codesearch/trigram_index.py
@ -0,0 +1,47 @@
 from typing import List, Optional
 import attr
 from .settings import settings
 from .base import IndexBase
 from .prefix_tree import PrefixTree
@attr.s
class TrigramIndex(IndexBase):
    """Index from trigrams to the uids of the documents containing them."""

    # Minimum fraction of query trigrams a document must contain to be
    # returned by ``query``.
    _threshold = attr.ib(default=settings.SIGNIFICANCE_THRESHOLD)
    _tree = attr.ib(attr.Factory(PrefixTree.initialize))

    def index(self, uid, key: str, content: str, trigrams):
        """Record ``uid`` under each trigram of a document.

        When ``content`` is given its trigrams are computed here and
        ``trigrams`` is ignored; otherwise the precomputed ``trigrams`` are
        used. ``key`` is accepted but not used by this method.
        """
        if content:
            trigrams = TrigramIndex.trigramize(content)
        for trigram in trigrams or ():
            self._tree.insert(trigram, uid)

    def query(self, query: str, haystack: Optional[List[str]] = None) -> List[tuple]:
        """Return ``(document uid, score)`` pairs, best score first.

        The score is the fraction of the query's distinct trigrams that are
        recorded for the document; documents scoring below the threshold
        are left out. A query shorter than three characters has no
        trigrams and yields no results. ``haystack`` is not used.
        """
        query_trigrams = TrigramIndex.trigramize(query)
        if not query_trigrams:
            return []
        matches = {}
        for trigram in query_trigrams:
            for doc in self._tree.get(trigram) or ():
                matches[doc] = matches.get(doc, 0) + 1
        significant_results = []
        for uid, occurrences in matches.items():
            score = occurrences / len(query_trigrams)
            if score >= self._threshold:
                significant_results.append((f"document:{uid}", score))
        # Rank by score (element 1); the sort used to key on the uid string
        # (element 0), which ordered results by name rather than relevance.
        significant_results.sort(reverse=True, key=lambda x: x[1])
        return significant_results

    @staticmethod
    def trigramize(content: str) -> set:
        """Return the set of lower-cased 3-character windows of ``content``."""
        return {content[pos : pos + 3].lower() for pos in range(len(content) - 2)}
--- a/src/codesearch/watcher.py
+++ b/src/codesearch/watcher.py
@ -0,0 +1,14 @@
 import pyinotify
 import attr
 from .logger import get_logger
 logger = get_logger(__name__)
@attr.s
class WatchHandler(pyinotify.ProcessEvent):
    """pyinotify event handler that re-indexes files when they are modified."""

    indexer = attr.ib()

    def process_IN_MODIFY(self, event):
        """Pass the modified file's path to the indexer."""
        modified_path = event.pathname
        self.indexer.index([modified_path])
--- a/src/setup.py
+++ b/src/setup.py
@ -0,0 +1,9 @@
 from setuptools import setup
 setup(
    name="codesearch",
    version="0.1",
    packages=["codesearch"],
    # The ``attr`` module used throughout the package (attr.s, attr.ib,
    # attr.Factory) is shipped by the "attrs" distribution; the PyPI project
    # named "attr" is a different, unrelated package.
    install_requires=["pyinotify", "attrs"],
    entry_points={"console_scripts": ["codesearch=codesearch.cli:main"]},
 )
--- a/tasks.py
+++ b/tasks.py
@ -0,0 +1,6 @@
 from invoke import task
@task
def lint(ctx):
    """Run black over the top-level ``*.py`` files and the ``src`` tree."""
    command = "black *.py src"
    ctx.run(command)
Author	SHA1	Message	Date
Marc Cataford	11799dcf27	chore: package reorg	2020-10-02 23:00:38 -04:00
Marc Cataford	db7146bb31	test: cov	2020-10-01 18:41:27 -04:00
Marc Cataford	fa66296ca8	wip: (dirty) more curses	2020-09-28 20:26:17 -04:00
Marc Cataford	02d4830be6	wip: (dirty) curses cli	2020-09-28 19:41:48 -04:00
Marc Cataford	26d40bd739	refactor: tidy	2020-09-28 17:17:03 -04:00
Marc Cataford	b9841084a1	wip: (dirty) buffer instead of large socket msg	2020-09-28 16:46:10 -04:00
Marc Cataford	806279edaa	wip: rejig colors, logging	2020-09-28 15:22:18 -04:00
Marc Cataford	50fac496ea	wip: prefix tree	2020-09-28 15:05:18 -04:00
Marc Cataford	e82bc7490b	wip: tidy up threaded indexing	2020-09-28 11:12:56 -04:00
Marc Cataford	eaf669aa85	wip: (dirty) split indexing in processes	2020-09-28 10:46:38 -04:00
Marc Cataford	4db9d43b02	refactor: corpus, document model	2020-09-28 09:39:45 -04:00
Marc Cataford	bd349d8338	chore: extraneous $	2020-09-28 09:39:30 -04:00
Marc Cataford	ddbed30d06	refactor: minor	2020-09-27 12:41:07 -04:00
Marc Cataford	374685ae09	refactor: indices, settings	2020-09-27 12:30:26 -04:00
Marc Cataford	59795a5dec	wip: functional version	2020-09-27 00:11:08 -04:00