From 4a06a2572c89d6a116bb23b7d16236b37b4061a0 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Thu, 20 Apr 2017 13:34:51 +0200 Subject: [PATCH] Using ftfy for handling broken encoded strings. --- requirements.txt | 1 + setup.py | 3 ++- spacy/cli/model.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6212ab3cd..0108c5621 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 +ftfy==4.4.2 pytest>=3.0.6,<4.0.0 diff --git a/setup.py b/setup.py index f7cd0ddcb..e343c3208 100644 --- a/setup.py +++ b/setup.py @@ -248,7 +248,8 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5'], + 'regex==2017.4.5', + 'ftfy == 4.4.2'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', diff --git a/spacy/cli/model.py b/spacy/cli/model.py index d697df05b..486bbea2f 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -6,6 +6,7 @@ import math from ast import literal_eval from pathlib import Path from preshed.counter import PreshCounter +import ftfy from ..vocab import write_binary_vectors from .. import util @@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob): with oov_path.open('w') as f: f.write('%f' % oov_prob) if vectors_path: - vectors_dest = model_path / 'vec.bin' + vectors_dest = vocab_path / 'vec.bin' write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix()) @@ -76,6 +77,7 @@ def read_clusters(clusters_path): for line in f: try: cluster, word, freq = line.split() + word = ftfy.fix_text(word) except ValueError: continue # If the clusterer has only seen the word a few times, its