mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Use ftfy to fix strings with broken encoding.
This commit is contained in:
parent
2bd89e7ade
commit
4a06a2572c
|
@ -11,4 +11,5 @@ ujson>=1.35
|
|||
dill>=0.2,<0.3
|
||||
requests>=2.13.0,<3.0.0
|
||||
regex==2017.4.5
|
||||
ftfy==4.4.2
|
||||
pytest>=3.0.6,<4.0.0
|
||||
|
|
3
setup.py
3
setup.py
|
@ -248,7 +248,8 @@ def setup_package():
|
|||
'ujson>=1.35',
|
||||
'dill>=0.2,<0.3',
|
||||
'requests>=2.13.0,<3.0.0',
|
||||
'regex==2017.4.5'],
|
||||
'regex==2017.4.5',
|
||||
'ftfy == 4.4.2'],
|
||||
classifiers=[
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
|
|
|
@ -6,6 +6,7 @@ import math
|
|||
from ast import literal_eval
|
||||
from pathlib import Path
|
||||
from preshed.counter import PreshCounter
|
||||
import ftfy
|
||||
|
||||
from ..vocab import write_binary_vectors
|
||||
from .. import util
|
||||
|
@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob):
|
|||
with oov_path.open('w') as f:
|
||||
f.write('%f' % oov_prob)
|
||||
if vectors_path:
|
||||
vectors_dest = model_path / 'vec.bin'
|
||||
vectors_dest = vocab_path / 'vec.bin'
|
||||
write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
|
||||
|
||||
|
||||
|
@ -76,6 +77,7 @@ def read_clusters(clusters_path):
|
|||
for line in f:
|
||||
try:
|
||||
cluster, word, freq = line.split()
|
||||
word = ftfy.fix_text(word)
|
||||
except ValueError:
|
||||
continue
|
||||
# If the clusterer has only seen the word a few times, its
|
||||
|
|
Loading…
Reference in New Issue
Block a user