mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Use ftfy to repair strings with broken encodings (mojibake) when reading cluster words.
This commit is contained in:
parent
2bd89e7ade
commit
4a06a2572c
|
@ -11,4 +11,5 @@ ujson>=1.35
|
||||||
dill>=0.2,<0.3
|
dill>=0.2,<0.3
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
regex==2017.4.5
|
regex==2017.4.5
|
||||||
|
ftfy==4.4.2
|
||||||
pytest>=3.0.6,<4.0.0
|
pytest>=3.0.6,<4.0.0
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -248,7 +248,8 @@ def setup_package():
|
||||||
'ujson>=1.35',
|
'ujson>=1.35',
|
||||||
'dill>=0.2,<0.3',
|
'dill>=0.2,<0.3',
|
||||||
'requests>=2.13.0,<3.0.0',
|
'requests>=2.13.0,<3.0.0',
|
||||||
'regex==2017.4.5'],
|
'regex==2017.4.5',
|
||||||
|
'ftfy == 4.4.2'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
|
|
|
@ -6,6 +6,7 @@ import math
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from preshed.counter import PreshCounter
|
from preshed.counter import PreshCounter
|
||||||
|
import ftfy
|
||||||
|
|
||||||
from ..vocab import write_binary_vectors
|
from ..vocab import write_binary_vectors
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob):
|
||||||
with oov_path.open('w') as f:
|
with oov_path.open('w') as f:
|
||||||
f.write('%f' % oov_prob)
|
f.write('%f' % oov_prob)
|
||||||
if vectors_path:
|
if vectors_path:
|
||||||
vectors_dest = model_path / 'vec.bin'
|
vectors_dest = vocab_path / 'vec.bin'
|
||||||
write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
|
write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
|
||||||
|
|
||||||
|
|
||||||
|
@ -76,6 +77,7 @@ def read_clusters(clusters_path):
|
||||||
for line in f:
|
for line in f:
|
||||||
try:
|
try:
|
||||||
cluster, word, freq = line.split()
|
cluster, word, freq = line.split()
|
||||||
|
word = ftfy.fix_text(word)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
# If the clusterer has only seen the word a few times, its
|
# If the clusterer has only seen the word a few times, its
|
||||||
|
|
Loading…
Reference in New Issue
Block a user