import sys
import nltk
from nltk.stem.porter import PorterStemmer
from glob import glob
from infi.clickhouse_orm import Database
from models import Fragment
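
# `Fragment` is defined in models.py (not shown here). Judging from the keyword
# arguments passed to it in get_fragments() below, it is assumed to look roughly
# like this sketch; the actual field types and table engine live in models.py:
#
#   class Fragment(Model):
#       document = StringField()
#       idx = UInt64Field()
#       word = StringField()
#       stem = StringField()
#       engine = ...  # some MergeTree-family engine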


def trim_punctuation(word):
    '''
    Trim punctuation characters from the beginning and end of the word
    '''
    # Scan for the first and last alphanumeric characters; anything outside
    # that range is treated as punctuation and sliced off.
    start = end = len(word)
    for i in range(len(word)):
        if word[i].isalnum():
            start = min(start, i)
            end = i + 1
    return word[start : end]
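
# A quick illustration (hypothetical inputs):
#   trim_punctuation('"Hello,"')  ->  'Hello'
#   trim_punctuation('---')       ->  ''   (no alphanumeric characters at all)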


def parse_file(filename):
    '''
    Parses a text file at the given path.
    Returns a generator of tuples (original_word, stemmed_word)
    The original_word may include punctuation characters.
    '''
    stemmer = PorterStemmer()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            for word in line.split():
                yield (word, stemmer.stem(trim_punctuation(word)))
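
# For a line such as "The pictures were amazing," this yields pairs like
# ('The', 'the'), ('pictures', 'pictur'), ('amazing,', 'amaz'); hypothetical
# input, and the exact stems depend on NLTK's Porter implementation.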


def get_fragments(filename):
    '''
    Converts a text file at the given path to a generator
    of Fragment instances.
    '''
    from os import path
    # Use the file name (without directory and extension) as the document id
    document = path.splitext(path.basename(filename))[0]
    idx = 0
    for word, stem in parse_file(filename):
        idx += 1
        yield Fragment(document=document, idx=idx, word=word, stem=stem)
    # Reached only once the generator has been fully consumed
    print('{} - {} words'.format(filename, idx))


if __name__ == '__main__':

    # Load NLTK data if necessary
    nltk.download('punkt')
    nltk.download('wordnet')

    # Initialize database
    db = Database('default')
    db.create_table(Fragment)

    # Load files from the command line or everything under ebooks/
    filenames = sys.argv[1:] or glob('ebooks/*.txt')
    for filename in filenames:
        db.insert(get_fragments(filename), batch_size=100000)
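
# Once loaded, the fragments can be queried back through the ORM's queryset
# API, for example (a sketch; 'dorian' is just a hypothetical stem value):
#
#   for fragment in Fragment.objects_in(db).filter(stem='dorian'):
#       print(fragment.document, fragment.idx, fragment.word)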