mirror of
https://github.com/Infinidat/infi.clickhouse_orm.git
synced 2025-08-02 19:20:14 +03:00
Chore: fix linting for examples
This commit is contained in:
parent
0e9dea5bcb
commit
ce79b39407
|
@ -1,20 +1,25 @@
|
||||||
import psutil, time, datetime
|
import datetime
|
||||||
from clickhouse_orm import Database
|
import time
|
||||||
|
|
||||||
|
import psutil
|
||||||
from models import CPUStats
|
from models import CPUStats
|
||||||
|
|
||||||
|
from clickhouse_orm import Database
|
||||||
|
|
||||||
db = Database('demo')
|
db = Database("demo")
|
||||||
db.create_table(CPUStats)
|
db.create_table(CPUStats)
|
||||||
|
|
||||||
|
|
||||||
psutil.cpu_percent(percpu=True) # first sample should be discarded
|
psutil.cpu_percent(percpu=True) # first sample should be discarded
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
stats = psutil.cpu_percent(percpu=True)
|
stats = psutil.cpu_percent(percpu=True)
|
||||||
timestamp = datetime.datetime.now()
|
timestamp = datetime.datetime.now()
|
||||||
print(timestamp)
|
print(timestamp)
|
||||||
db.insert([
|
db.insert(
|
||||||
CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
|
[
|
||||||
for cpu_id, cpu_percent in enumerate(stats)
|
CPUStats(timestamp=timestamp, cpu_id=cpu_id, cpu_percent=cpu_percent)
|
||||||
])
|
for cpu_id, cpu_percent in enumerate(stats)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from clickhouse_orm import Model, DateTimeField, UInt16Field, Float32Field, Memory
|
from clickhouse_orm import DateTimeField, Float32Field, Memory, Model, UInt16Field
|
||||||
|
|
||||||
|
|
||||||
class CPUStats(Model):
|
class CPUStats(Model):
|
||||||
|
@ -8,4 +8,3 @@ class CPUStats(Model):
|
||||||
cpu_percent = Float32Field()
|
cpu_percent = Float32Field()
|
||||||
|
|
||||||
engine = Memory()
|
engine = Memory()
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
from clickhouse_orm import Database, F
|
|
||||||
from models import CPUStats
|
from models import CPUStats
|
||||||
|
|
||||||
|
from clickhouse_orm import Database, F
|
||||||
|
|
||||||
db = Database('demo')
|
db = Database("demo")
|
||||||
queryset = CPUStats.objects_in(db)
|
queryset = CPUStats.objects_in(db)
|
||||||
total = queryset.filter(CPUStats.cpu_id == 1).count()
|
total = queryset.filter(CPUStats.cpu_id == 1).count()
|
||||||
busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
|
busy = queryset.filter(CPUStats.cpu_id == 1, CPUStats.cpu_percent > 95).count()
|
||||||
print('CPU 1 was busy {:.2f}% of the time'.format(busy * 100.0 / total))
|
print("CPU 1 was busy {:.2f}% of the time".format(busy * 100.0 / total))
|
||||||
|
|
||||||
# Calculate the average usage per CPU
|
# Calculate the average usage per CPU
|
||||||
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
|
for row in queryset.aggregate(CPUStats.cpu_id, average=F.avg(CPUStats.cpu_percent)):
|
||||||
print('CPU {row.cpu_id}: {row.average:.2f}%'.format(row=row))
|
print("CPU {row.cpu_id}: {row.average:.2f}%".format(row=row))
|
||||||
|
|
|
@ -1,62 +1,73 @@
|
||||||
import pygal
|
import pygal
|
||||||
from pygal.style import RotateStyle
|
|
||||||
from jinja2.filters import do_filesizeformat
|
from jinja2.filters import do_filesizeformat
|
||||||
|
from pygal.style import RotateStyle
|
||||||
|
|
||||||
|
|
||||||
# Formatting functions
|
# Formatting functions
|
||||||
number_formatter = lambda v: '{:,}'.format(v)
|
def number_formatter(v):
|
||||||
bytes_formatter = lambda v: do_filesizeformat(v, True)
|
return "{:,}".format(v)
|
||||||
|
|
||||||
|
|
||||||
|
def bytes_formatter(v):
    """
    Format a byte count as a human-readable file size string.

    `v` - the numeric value (number of bytes) to format

    Returns the string produced by jinja2's `do_filesizeformat`, called with
    its second argument set to True (binary prefixes such as KiB / MiB).
    """
    # The result must be returned: this function is passed around as a
    # `value_formatter`, and without the `return` it always yields None.
    return do_filesizeformat(v, True)
|
||||||
|
|
||||||
|
|
||||||
def tables_piechart(db, by_field, value_formatter):
|
def tables_piechart(db, by_field, value_formatter):
|
||||||
'''
|
"""
|
||||||
Generate a pie chart of the top n tables in the database.
|
Generate a pie chart of the top n tables in the database.
|
||||||
`db` - the database instance
|
`db` - the database instance
|
||||||
`by_field` - the field name to sort by
|
`by_field` - the field name to sort by
|
||||||
`value_formatter` - a function to use for formatting the numeric values
|
`value_formatter` - a function to use for formatting the numeric values
|
||||||
'''
|
"""
|
||||||
Tables = db.get_model_for_table('tables', system_table=True)
|
Tables = db.get_model_for_table("tables", system_table=True)
|
||||||
qs = Tables.objects_in(db).filter(database=db.db_name, is_temporary=False).exclude(engine='Buffer')
|
qs = Tables.objects_in(db).filter(database=db.db_name, is_temporary=False).exclude(engine="Buffer")
|
||||||
tuples = [(getattr(table, by_field), table.name) for table in qs]
|
tuples = [(getattr(table, by_field), table.name) for table in qs]
|
||||||
return _generate_piechart(tuples, value_formatter)
|
return _generate_piechart(tuples, value_formatter)
|
||||||
|
|
||||||
|
|
||||||
def columns_piechart(db, tbl_name, by_field, value_formatter):
|
def columns_piechart(db, tbl_name, by_field, value_formatter):
|
||||||
'''
|
"""
|
||||||
Generate a pie chart of the top n columns in the table.
|
Generate a pie chart of the top n columns in the table.
|
||||||
`db` - the database instance
|
`db` - the database instance
|
||||||
`tbl_name` - the table name
|
`tbl_name` - the table name
|
||||||
`by_field` - the field name to sort by
|
`by_field` - the field name to sort by
|
||||||
`value_formatter` - a function to use for formatting the numeric values
|
`value_formatter` - a function to use for formatting the numeric values
|
||||||
'''
|
"""
|
||||||
ColumnsTable = db.get_model_for_table('columns', system_table=True)
|
ColumnsTable = db.get_model_for_table("columns", system_table=True)
|
||||||
qs = ColumnsTable.objects_in(db).filter(database=db.db_name, table=tbl_name)
|
qs = ColumnsTable.objects_in(db).filter(database=db.db_name, table=tbl_name)
|
||||||
tuples = [(getattr(col, by_field), col.name) for col in qs]
|
tuples = [(getattr(col, by_field), col.name) for col in qs]
|
||||||
return _generate_piechart(tuples, value_formatter)
|
return _generate_piechart(tuples, value_formatter)
|
||||||
|
|
||||||
|
|
||||||
def _get_top_tuples(tuples, n=15):
|
def _get_top_tuples(tuples, n=15):
|
||||||
'''
|
"""
|
||||||
Given a list of tuples (value, name), this function sorts
|
Given a list of tuples (value, name), this function sorts
|
||||||
the list and returns only the top n results. All other tuples
|
the list and returns only the top n results. All other tuples
|
||||||
are aggregated to a single "others" tuple.
|
are aggregated to a single "others" tuple.
|
||||||
'''
|
"""
|
||||||
non_zero_tuples = [t for t in tuples if t[0]]
|
non_zero_tuples = [t for t in tuples if t[0]]
|
||||||
sorted_tuples = sorted(non_zero_tuples, reverse=True)
|
sorted_tuples = sorted(non_zero_tuples, reverse=True)
|
||||||
if len(sorted_tuples) > n:
|
if len(sorted_tuples) > n:
|
||||||
others = (sum(t[0] for t in sorted_tuples[n:]), 'others')
|
others = (sum(t[0] for t in sorted_tuples[n:]), "others")
|
||||||
sorted_tuples = sorted_tuples[:n] + [others]
|
sorted_tuples = sorted_tuples[:n] + [others]
|
||||||
return sorted_tuples
|
return sorted_tuples
|
||||||
|
|
||||||
|
|
||||||
def _generate_piechart(tuples, value_formatter):
|
def _generate_piechart(tuples, value_formatter):
|
||||||
'''
|
"""
|
||||||
Generates a pie chart.
|
Generates a pie chart.
|
||||||
`tuples` - a list of (value, name) tuples to include in the chart
|
`tuples` - a list of (value, name) tuples to include in the chart
|
||||||
`value_formatter` - a function to use for formatting the values
|
`value_formatter` - a function to use for formatting the values
|
||||||
'''
|
"""
|
||||||
style = RotateStyle('#9e6ffe', background='white', legend_font_family='Roboto', legend_font_size=18, tooltip_font_family='Roboto', tooltip_font_size=24)
|
style = RotateStyle(
|
||||||
chart = pygal.Pie(style=style, margin=0, title=' ', value_formatter=value_formatter, truncate_legend=-1)
|
"#9e6ffe",
|
||||||
|
background="white",
|
||||||
|
legend_font_family="Roboto",
|
||||||
|
legend_font_size=18,
|
||||||
|
tooltip_font_family="Roboto",
|
||||||
|
tooltip_font_size=24,
|
||||||
|
)
|
||||||
|
chart = pygal.Pie(style=style, margin=0, title=" ", value_formatter=value_formatter, truncate_legend=-1)
|
||||||
for t in _get_top_tuples(tuples):
|
for t in _get_top_tuples(tuples):
|
||||||
chart.add(t[1], t[0])
|
chart.add(t[1], t[0])
|
||||||
return chart.render(is_unicode=True, disable_xml_declaration=True)
|
return chart.render(is_unicode=True, disable_xml_declaration=True)
|
||||||
|
|
|
@ -1,87 +1,93 @@
|
||||||
from clickhouse_orm import Database, F
|
|
||||||
from charts import tables_piechart, columns_piechart, number_formatter, bytes_formatter
|
|
||||||
from flask import Flask
|
|
||||||
from flask import render_template
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from charts import bytes_formatter, columns_piechart, number_formatter, tables_piechart
|
||||||
|
from flask import Flask, render_template
|
||||||
|
|
||||||
|
from clickhouse_orm import Database, F
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/')
|
@app.route("/")
|
||||||
def homepage_view():
|
def homepage_view():
|
||||||
'''
|
"""
|
||||||
Root view that lists all databases.
|
Root view that lists all databases.
|
||||||
'''
|
"""
|
||||||
db = _get_db('system')
|
db = _get_db("system")
|
||||||
# Get all databases in the system.databases table
|
# Get all databases in the system.databases table
|
||||||
DatabasesTable = db.get_model_for_table('databases', system_table=True)
|
DatabasesTable = db.get_model_for_table("databases", system_table=True)
|
||||||
databases = DatabasesTable.objects_in(db).exclude(name='system')
|
databases = DatabasesTable.objects_in(db).exclude(name="system")
|
||||||
databases = databases.order_by(F.lower(DatabasesTable.name))
|
databases = databases.order_by(F.lower(DatabasesTable.name))
|
||||||
# Generate the page
|
# Generate the page
|
||||||
return render_template('homepage.html', db=db, databases=databases)
|
return render_template("homepage.html", db=db, databases=databases)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/<db_name>/')
|
@app.route("/<db_name>/")
|
||||||
def database_view(db_name):
|
def database_view(db_name):
|
||||||
'''
|
"""
|
||||||
A view that displays information about a single database.
|
A view that displays information about a single database.
|
||||||
'''
|
"""
|
||||||
db = _get_db(db_name)
|
db = _get_db(db_name)
|
||||||
# Get all the tables in the database, by aggregating information from system.columns
|
# Get all the tables in the database, by aggregating information from system.columns
|
||||||
ColumnsTable = db.get_model_for_table('columns', system_table=True)
|
ColumnsTable = db.get_model_for_table("columns", system_table=True)
|
||||||
tables = ColumnsTable.objects_in(db).filter(database=db_name).aggregate(
|
tables = (
|
||||||
ColumnsTable.table,
|
ColumnsTable.objects_in(db)
|
||||||
compressed_size=F.sum(ColumnsTable.data_compressed_bytes),
|
.filter(database=db_name)
|
||||||
uncompressed_size=F.sum(ColumnsTable.data_uncompressed_bytes),
|
.aggregate(
|
||||||
ratio=F.sum(ColumnsTable.data_uncompressed_bytes) / F.sum(ColumnsTable.data_compressed_bytes)
|
ColumnsTable.table,
|
||||||
|
compressed_size=F.sum(ColumnsTable.data_compressed_bytes),
|
||||||
|
uncompressed_size=F.sum(ColumnsTable.data_uncompressed_bytes),
|
||||||
|
ratio=F.sum(ColumnsTable.data_uncompressed_bytes) / F.sum(ColumnsTable.data_compressed_bytes),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
tables = tables.order_by(F.lower(ColumnsTable.table))
|
tables = tables.order_by(F.lower(ColumnsTable.table))
|
||||||
# Generate the page
|
# Generate the page
|
||||||
return render_template('database.html',
|
return render_template(
|
||||||
|
"database.html",
|
||||||
db=db,
|
db=db,
|
||||||
tables=tables,
|
tables=tables,
|
||||||
tables_piechart_by_rows=tables_piechart(db, 'total_rows', value_formatter=number_formatter),
|
tables_piechart_by_rows=tables_piechart(db, "total_rows", value_formatter=number_formatter),
|
||||||
tables_piechart_by_size=tables_piechart(db, 'total_bytes', value_formatter=bytes_formatter),
|
tables_piechart_by_size=tables_piechart(db, "total_bytes", value_formatter=bytes_formatter),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/<db_name>/<tbl_name>/')
|
@app.route("/<db_name>/<tbl_name>/")
|
||||||
def table_view(db_name, tbl_name):
|
def table_view(db_name, tbl_name):
|
||||||
'''
|
"""
|
||||||
A view that displays information about a single table.
|
A view that displays information about a single table.
|
||||||
'''
|
"""
|
||||||
db = _get_db(db_name)
|
db = _get_db(db_name)
|
||||||
# Get table information from system.tables
|
# Get table information from system.tables
|
||||||
TablesTable = db.get_model_for_table('tables', system_table=True)
|
TablesTable = db.get_model_for_table("tables", system_table=True)
|
||||||
tbl_info = TablesTable.objects_in(db).filter(database=db_name, name=tbl_name)[0]
|
tbl_info = TablesTable.objects_in(db).filter(database=db_name, name=tbl_name)[0]
|
||||||
# Get the SQL used for creating the table
|
# Get the SQL used for creating the table
|
||||||
create_table_sql = db.raw('SHOW CREATE TABLE %s FORMAT TabSeparatedRaw' % tbl_name)
|
create_table_sql = db.raw("SHOW CREATE TABLE %s FORMAT TabSeparatedRaw" % tbl_name)
|
||||||
# Get all columns in the table from system.columns
|
# Get all columns in the table from system.columns
|
||||||
ColumnsTable = db.get_model_for_table('columns', system_table=True)
|
ColumnsTable = db.get_model_for_table("columns", system_table=True)
|
||||||
columns = ColumnsTable.objects_in(db).filter(database=db_name, table=tbl_name)
|
columns = ColumnsTable.objects_in(db).filter(database=db_name, table=tbl_name)
|
||||||
# Generate the page
|
# Generate the page
|
||||||
return render_template('table.html',
|
return render_template(
|
||||||
|
"table.html",
|
||||||
db=db,
|
db=db,
|
||||||
tbl_name=tbl_name,
|
tbl_name=tbl_name,
|
||||||
tbl_info=tbl_info,
|
tbl_info=tbl_info,
|
||||||
create_table_sql=create_table_sql,
|
create_table_sql=create_table_sql,
|
||||||
columns=columns,
|
columns=columns,
|
||||||
piechart=columns_piechart(db, tbl_name, 'data_compressed_bytes', value_formatter=bytes_formatter),
|
piechart=columns_piechart(db, tbl_name, "data_compressed_bytes", value_formatter=bytes_formatter),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _get_db(db_name):
|
def _get_db(db_name):
|
||||||
'''
|
"""
|
||||||
Returns a Database instance using connection information
|
Returns a Database instance using connection information
|
||||||
from the command line arguments (optional).
|
from the command line arguments (optional).
|
||||||
'''
|
"""
|
||||||
db_url = sys.argv[1] if len(sys.argv) > 1 else 'http://localhost:8123/'
|
db_url = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8123/"
|
||||||
username = sys.argv[2] if len(sys.argv) > 2 else None
|
username = sys.argv[2] if len(sys.argv) > 2 else None
|
||||||
password = sys.argv[3] if len(sys.argv) > 3 else None
|
password = sys.argv[3] if len(sys.argv) > 3 else None
|
||||||
return Database(db_name, db_url, username, password, readonly=True)
|
return Database(db_name, db_url, username, password, readonly=True)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
_get_db('system') # fail early on db connection problems
|
_get_db("system") # fail early on db connection problems
|
||||||
app.run(debug=True)
|
app.run(debug=True)
|
||||||
|
|
|
@ -1,27 +1,28 @@
|
||||||
import requests
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
def download_ebook(id):
    """
    Download a Project Gutenberg ebook and save it under the ebooks/ directory.

    `id` - the numeric Project Gutenberg id of the ebook

    The file is saved as ebooks/<title>.txt, where the title is taken from the
    first line of the text that starts with "Title:". If the text has no such
    line (or the title is empty), the name falls back to "ebook-<id>".
    Ebooks that respond with 404 are reported and skipped; any other HTTP
    error raises via `raise_for_status()`.
    """
    print(id, end=" ")
    # Download the ebook's text
    r = requests.get("https://www.gutenberg.org/files/{id}/{id}-0.txt".format(id=id))
    if r.status_code == 404:
        print("NOT FOUND, SKIPPING")
        return
    r.raise_for_status()
    # Find the ebook's title. Stop at the first match so that a later line
    # in the book body which happens to start with "Title:" cannot override
    # the header entry.
    text = r.content.decode("utf-8")
    title = ""
    for line in text.splitlines():
        if line.startswith("Title:"):
            title = line[len("Title:"):].strip()
            break
    if not title:
        # Previously `title` was left unbound here, raising a NameError
        title = "ebook-{}".format(id)
    print(title)
    # Save the ebook. Path separators inside the title would otherwise be
    # interpreted as directories and make open() fail.
    filename = title.replace("/", "-").replace("\\", "-")
    with open("ebooks/{}.txt".format(filename), "wb") as f:
        f.write(r.content)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
os.makedirs('ebooks', exist_ok=True)
|
os.makedirs("ebooks", exist_ok=True)
|
||||||
for i in [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]:
|
for i in [1342, 11, 84, 2701, 25525, 1661, 98, 74, 43, 215, 1400, 76]:
|
||||||
download_ebook(i)
|
download_ebook(i)
|
||||||
|
|
|
@ -1,61 +1,64 @@
|
||||||
import sys
|
import sys
|
||||||
import nltk
|
|
||||||
from nltk.stem.porter import PorterStemmer
|
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from clickhouse_orm import Database
|
|
||||||
|
import nltk
|
||||||
from models import Fragment
|
from models import Fragment
|
||||||
|
from nltk.stem.porter import PorterStemmer
|
||||||
|
|
||||||
|
from clickhouse_orm import Database
|
||||||
|
|
||||||
|
|
||||||
def trim_punctuation(word):
    """
    Trim punctuation characters from the beginning and end of the word.

    `word` - the string to trim

    Returns the substring spanning the first through the last alphanumeric
    character of `word`. Non-alphanumeric characters in the middle are kept.
    If `word` contains no alphanumeric characters, returns an empty string.
    """
    # Positions of all alphanumeric characters; only the outermost two matter
    alnum_positions = [i for i, char in enumerate(word) if char.isalnum()]
    if not alnum_positions:
        return ""
    first, last = alnum_positions[0], alnum_positions[-1]
    return word[first : last + 1]
|
||||||
|
|
||||||
|
|
||||||
def parse_file(filename):
|
def parse_file(filename):
|
||||||
'''
|
"""
|
||||||
Parses a text file at the give path.
|
Parses a text file at the given path.
|
||||||
Returns a generator of tuples (original_word, stemmed_word)
|
Returns a generator of tuples (original_word, stemmed_word)
|
||||||
The original_word may include punctuation characters.
|
The original_word may include punctuation characters.
|
||||||
'''
|
"""
|
||||||
stemmer = PorterStemmer()
|
stemmer = PorterStemmer()
|
||||||
with open(filename, 'r', encoding='utf-8') as f:
|
with open(filename, "r", encoding="utf-8") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
for word in line.split():
|
for word in line.split():
|
||||||
yield (word, stemmer.stem(trim_punctuation(word)))
|
yield (word, stemmer.stem(trim_punctuation(word)))
|
||||||
|
|
||||||
|
|
||||||
def get_fragments(filename):
|
def get_fragments(filename):
|
||||||
'''
|
"""
|
||||||
Converts a text file at the given path to a generator
|
Converts a text file at the given path to a generator
|
||||||
of Fragment instances.
|
of Fragment instances.
|
||||||
'''
|
"""
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
document = path.splitext(path.basename(filename))[0]
|
document = path.splitext(path.basename(filename))[0]
|
||||||
idx = 0
|
idx = 0
|
||||||
for word, stem in parse_file(filename):
|
for word, stem in parse_file(filename):
|
||||||
idx += 1
|
idx += 1
|
||||||
yield Fragment(document=document, idx=idx, word=word, stem=stem)
|
yield Fragment(document=document, idx=idx, word=word, stem=stem)
|
||||||
print('{} - {} words'.format(filename, idx))
|
print("{} - {} words".format(filename, idx))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
|
|
||||||
# Load NLTK data if necessary
|
# Load NLTK data if necessary
|
||||||
nltk.download('punkt')
|
nltk.download("punkt")
|
||||||
nltk.download('wordnet')
|
nltk.download("wordnet")
|
||||||
|
|
||||||
# Initialize database
|
# Initialize database
|
||||||
db = Database('default')
|
db = Database("default")
|
||||||
db.create_table(Fragment)
|
db.create_table(Fragment)
|
||||||
|
|
||||||
# Load files from the command line or everything under ebooks/
|
# Load files from the command line or everything under ebooks/
|
||||||
filenames = sys.argv[1:] or glob('ebooks/*.txt')
|
filenames = sys.argv[1:] or glob("ebooks/*.txt")
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
db.insert(get_fragments(filename), batch_size=100000)
|
db.insert(get_fragments(filename), batch_size=100000)
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
from clickhouse_orm import *
|
from clickhouse_orm.engines import MergeTree
|
||||||
|
from clickhouse_orm.fields import LowCardinalityField, StringField, UInt64Field
|
||||||
|
from clickhouse_orm.models import Index, Model
|
||||||
|
|
||||||
|
|
||||||
class Fragment(Model):
|
class Fragment(Model):
|
||||||
|
|
||||||
language = LowCardinalityField(StringField(), default='EN')
|
language = LowCardinalityField(StringField(), default="EN")
|
||||||
document = LowCardinalityField(StringField())
|
document = LowCardinalityField(StringField())
|
||||||
idx = UInt64Field()
|
idx = UInt64Field()
|
||||||
word = StringField()
|
word = StringField()
|
||||||
stem = StringField()
|
stem = StringField()
|
||||||
|
|
||||||
# An index for faster search by document and fragment idx
|
# An index for faster search by document and fragment idx
|
||||||
index = Index((document, idx), type=Index.minmax(), granularity=1)
|
index = Index((document, idx), type=Index.minmax(), granularity=1)
|
||||||
|
|
||||||
# The primary key allows efficient lookup of stems
|
# The primary key allows efficient lookup of stems
|
||||||
engine = MergeTree(order_by=(stem, document, idx), partition_key=('language',))
|
engine = MergeTree(order_by=(stem, document, idx), partition_key=("language",))
|
||||||
|
|
|
@ -1,19 +1,20 @@
|
||||||
import sys
|
import sys
|
||||||
from colorama import init, Fore, Back, Style
|
|
||||||
from nltk.stem.porter import PorterStemmer
|
|
||||||
from clickhouse_orm import Database, F
|
|
||||||
from models import Fragment
|
|
||||||
from load import trim_punctuation
|
|
||||||
|
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
from load import trim_punctuation
|
||||||
|
from models import Fragment
|
||||||
|
from nltk.stem.porter import PorterStemmer
|
||||||
|
|
||||||
|
from clickhouse_orm import Database, F
|
||||||
|
|
||||||
# The wildcard character
|
# The wildcard character
|
||||||
WILDCARD = '*'
|
WILDCARD = "*"
|
||||||
|
|
||||||
|
|
||||||
def prepare_search_terms(text):
|
def prepare_search_terms(text):
|
||||||
'''
|
"""
|
||||||
Convert the text to search into a list of stemmed words.
|
Convert the text to search into a list of stemmed words.
|
||||||
'''
|
"""
|
||||||
stemmer = PorterStemmer()
|
stemmer = PorterStemmer()
|
||||||
stems = []
|
stems = []
|
||||||
for word in text.split():
|
for word in text.split():
|
||||||
|
@ -25,10 +26,10 @@ def prepare_search_terms(text):
|
||||||
|
|
||||||
|
|
||||||
def build_query(db, stems):
|
def build_query(db, stems):
|
||||||
'''
|
"""
|
||||||
Returns a queryset instance for finding sequences of Fragment instances
|
Returns a queryset instance for finding sequences of Fragment instances
|
||||||
that matche the list of stemmed words.
|
that match the list of stemmed words.
|
||||||
'''
|
"""
|
||||||
# Start by searching for the first stemmed word
|
# Start by searching for the first stemmed word
|
||||||
all_fragments = Fragment.objects_in(db)
|
all_fragments = Fragment.objects_in(db)
|
||||||
query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
|
query = all_fragments.filter(stem=stems[0]).only(Fragment.document, Fragment.idx)
|
||||||
|
@ -47,44 +48,44 @@ def build_query(db, stems):
|
||||||
|
|
||||||
|
|
||||||
def get_matching_text(db, document, from_idx, to_idx, extra=5):
|
def get_matching_text(db, document, from_idx, to_idx, extra=5):
|
||||||
'''
|
"""
|
||||||
Reconstructs the document text between the given indexes (inclusive),
|
Reconstructs the document text between the given indexes (inclusive),
|
||||||
plus `extra` words before and after the match. The words that are
|
plus `extra` words before and after the match. The words that are
|
||||||
included in the given range are highlighted in green.
|
included in the given range are highlighted in green.
|
||||||
'''
|
"""
|
||||||
text = []
|
text = []
|
||||||
conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
|
conds = (Fragment.document == document) & (Fragment.idx >= from_idx - extra) & (Fragment.idx <= to_idx + extra)
|
||||||
for fragment in Fragment.objects_in(db).filter(conds).order_by('document', 'idx'):
|
for fragment in Fragment.objects_in(db).filter(conds).order_by("document", "idx"):
|
||||||
word = fragment.word
|
word = fragment.word
|
||||||
if fragment.idx == from_idx:
|
if fragment.idx == from_idx:
|
||||||
word = Fore.GREEN + word
|
word = Fore.GREEN + word
|
||||||
if fragment.idx == to_idx:
|
if fragment.idx == to_idx:
|
||||||
word = word + Style.RESET_ALL
|
word = word + Style.RESET_ALL
|
||||||
text.append(word)
|
text.append(word)
|
||||||
return ' '.join(text)
|
return " ".join(text)
|
||||||
|
|
||||||
|
|
||||||
def find(db, text):
|
def find(db, text):
|
||||||
'''
|
"""
|
||||||
Performs the search for the given text, and prints out the matches.
|
Performs the search for the given text, and prints out the matches.
|
||||||
'''
|
"""
|
||||||
stems = prepare_search_terms(text)
|
stems = prepare_search_terms(text)
|
||||||
query = build_query(db, stems)
|
query = build_query(db, stems)
|
||||||
print('\n' + Fore.MAGENTA + str(query) + Style.RESET_ALL + '\n')
|
print("\n" + Fore.MAGENTA + str(query) + Style.RESET_ALL + "\n")
|
||||||
for match in query:
|
for match in query:
|
||||||
text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1)
|
text = get_matching_text(db, match.document, match.idx, match.idx + len(stems) - 1)
|
||||||
print(Fore.CYAN + match.document + ':' + Style.RESET_ALL, text)
|
print(Fore.CYAN + match.document + ":" + Style.RESET_ALL, text)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
|
|
||||||
# Initialize colored output
|
# Initialize colored output
|
||||||
init()
|
init()
|
||||||
|
|
||||||
# Initialize database
|
# Initialize database
|
||||||
db = Database('default')
|
db = Database("default")
|
||||||
|
|
||||||
# Search
|
# Search
|
||||||
text = ' '.join(sys.argv[1:])
|
text = " ".join(sys.argv[1:])
|
||||||
if text:
|
if text:
|
||||||
find(db, text)
|
find(db, text)
|
||||||
|
|
|
@ -1,10 +1,5 @@
|
||||||
[tool.black]
|
[tool.black]
|
||||||
line-length = 120
|
line-length = 120
|
||||||
extend-exclude = '''
|
|
||||||
/(
|
|
||||||
| examples
|
|
||||||
)/
|
|
||||||
'''
|
|
||||||
|
|
||||||
[tool.isort]
|
[tool.isort]
|
||||||
multi_line_output = 3
|
multi_line_output = 3
|
||||||
|
|
Loading…
Reference in New Issue
Block a user