# infi.clickhouse_orm/examples/full_text_search/load.py

import sys
from glob import glob
from os import path

import nltk
from nltk.stem.porter import PorterStemmer

from infi.clickhouse_orm import Database

from models import Fragment
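
# The Fragment model is defined in models.py next to this script. As a
# rough sketch (the field names follow the usage below; the exact types
# and table engine are assumptions, not copied from models.py):
#
#     class Fragment(Model):
#         document = StringField()
#         idx = UInt64Field()
#         word = StringField()
#         stem = StringField()
#         # ...plus a table engine such as MergeTree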


def trim_punctuation(word):
    '''
    Trims punctuation characters from the beginning and end of the word.
    Returns an empty string if the word contains no alphanumeric characters.
    '''
    # Track the range covered by the first and last alphanumeric
    # characters; everything outside it gets trimmed away
    start = end = len(word)
    for i in range(len(word)):
        if word[i].isalnum():
            start = min(start, i)
            end = i + 1
    return word[start:end]
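
# For example, trim_punctuation('"Hello!"') returns 'Hello'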


def parse_file(filename):
    '''
    Parses the text file at the given path.
    Returns a generator of (original_word, stemmed_word) tuples.
    The original_word may include punctuation characters.
    '''
    stemmer = PorterStemmer()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.split():
                yield (word, stemmer.stem(trim_punctuation(word)))
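
# For example, the word '"Hunting,"' is yielded as ('"Hunting,"', 'hunt'),
# since NLTK's PorterStemmer also lowercases its input by default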


def get_fragments(filename):
    '''
    Converts a text file at the given path to a generator
    of Fragment instances.
    '''
    # Use the file name, without its extension, as the document id
    document = path.splitext(path.basename(filename))[0]
    idx = 0
    for word, stem in parse_file(filename):
        idx += 1
        yield Fragment(document=document, idx=idx, word=word, stem=stem)
    print('{} - {} words'.format(filename, idx))
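
# Since get_fragments is a generator, db.insert below can send the rows
# to the database in batches instead of building them all in memory first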


if __name__ == '__main__':

    # Load NLTK data if necessary
    nltk.download('punkt')
    nltk.download('wordnet')

    # Initialize database
    db = Database('default')
    db.create_table(Fragment)

    # Load files from the command line or everything under ebooks/
    filenames = sys.argv[1:] or glob('ebooks/*.txt')
    for filename in filenames:
        db.insert(get_fragments(filename), batch_size=100000)
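
# Usage (assumes a local ClickHouse server with a 'default' database; the
# ORM connects to http://localhost:8123 by default):
#
#     python load.py                # load every .txt file under ebooks/
#     python load.py some_book.txt  # load only the given file(s)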