Merge branch 'master' of https://github.com/explosion/spaCy

Commit 6d301fa06f

Changed files: setup.py, spacy/about.py, spacy/gold.pyx, spacy/lemmatizer.py, spacy/matcher.pyx, spacy/vocab.pyx
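Taken together, the merged changes: bump the version from 1.0.4 to 1.0.5; replace the try/except ujson fallback with a hard "import ujson as json" in gold.pyx and lemmatizer.py; drop the 'data' entry from the package_data globs in setup.py; and switch every JSON/text read in gold.pyx, lemmatizer.py, matcher.pyx, and vocab.pyx to text mode with an explicit encoding='utf8' instead of binary mode or the locale default.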
setup.py

@@ -183,7 +183,7 @@ def setup_package():
         name=about['__title__'],
         zip_safe=False,
         packages=PACKAGES,
-        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens', 'data']},
+        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']},
         description=about['__summary__'],
         long_description=readme,
         author=about['__author__'],
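For reference, each entry in package_data is a glob matched relative to the package directory, so the removed 'data' entry would only ever match a file literally named "data"; a bare directory name does not pull in the directory's contents. A minimal sketch of the mechanism (package name hypothetical):

```python
from setuptools import setup

setup(
    name='example',
    packages=['example'],
    # The empty-string key applies these globs to every listed package;
    # each pattern matches files relative to that package's directory.
    # To include a directory's contents you would need e.g. 'data/*'.
    package_data={'': ['*.txt', '*.json']},
)
```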
spacy/about.py

@@ -4,7 +4,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

 __title__ = 'spacy'
-__version__ = '1.0.4'
+__version__ = '1.0.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
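The about['__title__'] lookups in the setup.py hunk above suggest the warehouse-style pattern referenced in the comment: setup.py executes about.py into a plain dict instead of importing the package before it is built. A sketch of that pattern, assuming the standard source layout:

```python
import os

about = {}
root = os.path.dirname(os.path.abspath(__file__))
# Executing the file avoids importing spacy (and its compiled extensions)
# before they exist; the module-level names land in the `about` dict.
with open(os.path.join(root, 'spacy', 'about.py')) as f:
    exec(f.read(), about)

print(about['__title__'], about['__version__'])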
spacy/gold.pyx

@@ -10,10 +10,7 @@ from os import path

 from libc.string cimport memset

-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json

 from .syntax import nonproj

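Dropping the except ImportError fallback makes ujson a hard dependency at this call site: the module now fails loudly at import time if ujson is missing, rather than silently degrading to the slower stdlib json module. Presumably ujson is (or is being made) a listed install requirement; the same change is repeated in lemmatizer.py below.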
@@ -149,7 +146,7 @@ def read_json_file(loc, docs_filter=None):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
-        with open(loc) as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             docs = json.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
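The io.open change pins the decoding behaviour: bare open() returns locale-decoded text on Python 3 (which can raise UnicodeDecodeError on non-ASCII corpora under a C/POSIX locale) and raw bytes on Python 2, while io.open behaves identically on both. A standalone sketch of the fixed pattern (function name hypothetical):

```python
import io
import json  # gold.pyx itself uses ujson under the same name

def read_docs(loc):
    # Explicit utf8 makes decoding independent of the host locale,
    # so json.load always sees correctly decoded unicode text.
    with io.open(loc, 'r', encoding='utf8') as file_:
        return json.load(file_)
```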
spacy/lemmatizer.py

@@ -2,10 +2,7 @@ from __future__ import unicode_literals, print_function
 import codecs
 import pathlib

-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json

 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT

@@ -28,7 +25,7 @@ class Lemmatizer(object):
                 exc[pos] = read_exc(file_)
             else:
                 exc[pos] = {}
-        with (path / 'vocab' / 'lemma_rules.json').open('rb') as file_:
+        with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
             rules = json.load(file_)
         return cls(index, exc, rules)

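Note that this call site switches from binary to text mode, not merely from locale text to utf8: ujson.load accepts both, but text mode with an explicit encoding yields the same decoded result on every platform. pathlib's Path.open() forwards mode and encoding to the built-in open, so the pattern is the same as with io.open. A minimal standalone sketch (directory layout hypothetical):

```python
import json
from pathlib import Path

def load_lemma_rules(data_dir):
    # Path.open() takes the same mode/encoding arguments as io.open().
    rules_loc = Path(data_dir) / 'vocab' / 'lemma_rules.json'
    with rules_loc.open('r', encoding='utf8') as file_:
        return json.load(file_)
```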
spacy/matcher.pyx

@@ -175,7 +175,7 @@ cdef class Matcher:
     @classmethod
     def load(cls, path, vocab):
         if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('rb') as file_:
+            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
         else:
             patterns = {}
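Matcher.load gets the identical binary-to-text fix as the lemmatizer rules above, so gazetteer patterns containing non-ASCII entries decode the same way on every platform.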
spacy/vocab.pyx

@@ -57,24 +57,24 @@ cdef class Vocab:
                 "vectors argument to Vocab.load() deprecated. "
                 "Install vectors after loading.")
         if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
-            with (path / 'vocab' / 'tag_map.json').open() as file_:
+            with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
                 tag_map = json.load(file_)
         if lex_attr_getters is not None \
         and oov_prob is True \
         and (path / 'vocab' / 'oov_prob').exists():
-            with (path / 'vocab' / 'oov_prob').open() as file_:
+            with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
                 oov_prob = float(file_.read())
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
         if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open() as file_:
+            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
                 serializer_freqs = json.load(file_)

         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)

-        with (path / 'vocab' / 'strings.json').open() as file_:
+        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             self.strings.load(file_)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
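These four call sites previously relied on .open() with no arguments, i.e. the locale's preferred encoding, so pinning utf8 makes Vocab.load() deterministic across platforms. The "is True" checks visible in the context follow the method's convention that passing True for a component means "load the default from path", so each guarded block only runs when the caller asked for the bundled data and the file actually exists.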