Using ftfy to handle strings with broken encoding.

This commit is contained in:
Gyorgy Orosz 2017-04-20 13:34:51 +02:00
parent 2bd89e7ade
commit 4a06a2572c
3 changed files with 6 additions and 2 deletions

View File

@@ -11,4 +11,5 @@ ujson>=1.35
dill>=0.2,<0.3 dill>=0.2,<0.3
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
regex==2017.4.5 regex==2017.4.5
ftfy==4.4.2
pytest>=3.0.6,<4.0.0 pytest>=3.0.6,<4.0.0

View File

@@ -248,7 +248,8 @@ def setup_package():
'ujson>=1.35', 'ujson>=1.35',
'dill>=0.2,<0.3', 'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0', 'requests>=2.13.0,<3.0.0',
'regex==2017.4.5'], 'regex==2017.4.5',
'ftfy == 4.4.2'],
classifiers=[ classifiers=[
'Development Status :: 5 - Production/Stable', 'Development Status :: 5 - Production/Stable',
'Environment :: Console', 'Environment :: Console',

View File

@@ -6,6 +6,7 @@ import math
from ast import literal_eval from ast import literal_eval
from pathlib import Path from pathlib import Path
from preshed.counter import PreshCounter from preshed.counter import PreshCounter
import ftfy
from ..vocab import write_binary_vectors from ..vocab import write_binary_vectors
from .. import util from .. import util
@@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob):
with oov_path.open('w') as f: with oov_path.open('w') as f:
f.write('%f' % oov_prob) f.write('%f' % oov_prob)
if vectors_path: if vectors_path:
vectors_dest = model_path / 'vec.bin' vectors_dest = vocab_path / 'vec.bin'
write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix()) write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
@@ -76,6 +77,7 @@ def read_clusters(clusters_path):
for line in f: for line in f:
try: try:
cluster, word, freq = line.split() cluster, word, freq = line.split()
word = ftfy.fix_text(word)
except ValueError: except ValueError:
continue continue
# If the clusterer has only seen the word a few times, its # If the clusterer has only seen the word a few times, its