From 4a06a2572c89d6a116bb23b7d16236b37b4061a0 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Thu, 20 Apr 2017 13:34:51 +0200 Subject: [PATCH 1/3] Using ftfy for handling broken encoded strings. --- requirements.txt | 1 + setup.py | 3 ++- spacy/cli/model.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6212ab3cd..0108c5621 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 +ftfy==4.4.2 pytest>=3.0.6,<4.0.0 diff --git a/setup.py b/setup.py index f7cd0ddcb..e343c3208 100644 --- a/setup.py +++ b/setup.py @@ -248,7 +248,8 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5'], + 'regex==2017.4.5', + 'ftfy == 4.4.2'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', diff --git a/spacy/cli/model.py b/spacy/cli/model.py index d697df05b..486bbea2f 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -6,6 +6,7 @@ import math from ast import literal_eval from pathlib import Path from preshed.counter import PreshCounter +import ftfy from ..vocab import write_binary_vectors from .. 
import util @@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob): with oov_path.open('w') as f: f.write('%f' % oov_prob) if vectors_path: - vectors_dest = model_path / 'vec.bin' + vectors_dest = vocab_path / 'vec.bin' write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix()) @@ -76,6 +77,7 @@ def read_clusters(clusters_path): for line in f: try: cluster, word, freq = line.split() + word = ftfy.fix_text(word) except ValueError: continue # If the clusterer has only seen the word a few times, its From 40a8f22ca72f609af7eb74764d1036ffa1834d5e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Apr 2017 15:38:52 +0200 Subject: [PATCH 2/3] Relax version constraint --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0108c5621..42910d1be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,5 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 -ftfy==4.4.2 +ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 From 417f430d23cf2c1ffcc16a886694d27b62c0e04e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Apr 2017 15:39:24 +0200 Subject: [PATCH 3/3] Relax version constraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e343c3208..69ba880eb 100644 --- a/setup.py +++ b/setup.py @@ -249,7 +249,7 @@ def setup_package(): 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', 'regex==2017.4.5', - 'ftfy == 4.4.2'], + 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console',