infi.clickhouse_orm/examples/full_text_search/search.py
2020-06-26 17:53:39 +03:00

91 lines
2.9 KiB
Python

import sys
from colorama import init, Fore, Back, Style
from nltk.stem.porter import PorterStemmer
from infi.clickhouse_orm import Database, F
from models import Fragment
from load import trim_punctuation
# The wildcard character
WILDCARD = '*'
def prepare_search_terms(text):
'''
Convert the text to search into a list of stemmed words.
'''
stemmer = PorterStemmer()
stems = []
for word in text.split():
if word == WILDCARD:
stems.append(WILDCARD)
else:
stems.append(stemmer.stem(trim_punctuation(word)))
return stems
def build_query(db, stems):
'''
Returns a queryset instance for finding sequences of Fragment instances
that matche the list of stemmed words.
'''
# Start by searching for the first stemmed word
all_fragments = Fragment.objects_in(db)
query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
# Add the following words to the queryset
for i, stem in enumerate(stems):
# Skip the first word (it's already in the query), and wildcards
if i == 0 or stem == WILDCARD:
continue
# Create a subquery that finds instances of the i'th word
subquery = all_fragments.filter(stem=stem).only(Fragment.document, Fragment.idx)
# Add it to the query, requiring that it will appear i places away from the first word
query = query.filter(F.isIn((Fragment.document, Fragment.idx + i), subquery))
# Sort the results
query = query.order_by(Fragment.document, Fragment.idx)
return query
def get_matching_text(db, document, from_idx, to_idx, extra=5):
'''
Reconstructs the document text between the given indexes (inclusive),
plus `extra` words before and after the match. The words that are
included in the given range are highlighted in green.
'''
text = []
conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'):
word = fragment.word
if fragment.idx == from_idx:
word = Fore.GREEN + word
if fragment.idx == to_idx:
word = word + Style.RESET_ALL
text.append(word)
return ' '.join(text)
def find(db, text):
'''
Performs the search for the given text, and prints out the matches.
'''
stems = prepare_search_terms(text)
query = build_query(db, stems)
print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
for match in query:
text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1)
print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, text)
if __name__ == '__main__':
# Initialize colored output
init()
# Initialize database
db = Database('default')
# Search
text = ' '.join(sys.argv[1:])
if text:
find(db, text)