Use ftfy to repair strings with broken encodings.

2026-01-10 18:51:21 +03:00 · 2017-04-20 13:34:51 +02:00 · 2017-04-20 13:34:51 +02:00 · 4a06a2572c
commit 4a06a2572c
parent 2bd89e7ade
3 changed files with 6 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,5 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+ftfy==4.4.2
 pytest>=3.0.6,<4.0.0
--- a/setup.py
+++ b/setup.py
@@ -248,7 +248,8 @@ def setup_package():
                'ujson>=1.35',
                'dill>=0.2,<0.3',
                'requests>=2.13.0,<3.0.0',
-                'regex==2017.4.5'],
+                'regex==2017.4.5',
+                'ftfy == 4.4.2'],
            classifiers=[
                'Development Status :: 5 - Production/Stable',
                'Environment :: Console',
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@@ -6,6 +6,7 @@ import math
 from ast import literal_eval
 from pathlib import Path
 from preshed.counter import PreshCounter
+import ftfy

 from ..vocab import write_binary_vectors
 from .. import util
@@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob):
    with oov_path.open('w') as f:
        f.write('%f' % oov_prob)
    if vectors_path:
-        vectors_dest = model_path / 'vec.bin'
+        vectors_dest = vocab_path / 'vec.bin'
        write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())


@@ -76,6 +77,7 @@ def read_clusters(clusters_path):
        for line in f:
            try:
                cluster, word, freq = line.split()
+                word = ftfy.fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its