mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-13 10:00:34 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
b3b5521625
|
@ -27,7 +27,7 @@ def info(cmd, model=None, markdown=False):
|
||||||
meta_path = model_path / 'meta.json'
|
meta_path = model_path / 'meta.json'
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
util.prints(meta_path, title="Can't find model meta.json", exits=1)
|
util.prints(meta_path, title="Can't find model meta.json", exits=1)
|
||||||
meta = read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
meta['link'] = path2str(model_path)
|
meta['link'] = path2str(model_path)
|
||||||
meta['source'] = path2str(model_path.resolve())
|
meta['source'] = path2str(model_path.resolve())
|
||||||
|
|
|
@ -15,9 +15,9 @@ def noun_chunks(obj):
|
||||||
# and not just "eine Tasse", same for "das Thema Familie".
|
# and not just "eine Tasse", same for "das Thema Familie".
|
||||||
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
labels = ['sb', 'oa', 'da', 'nk', 'mo', 'ag', 'ROOT', 'root', 'cj', 'pd', 'og', 'app']
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
np_label = doc.vocab.strings['NP']
|
np_label = doc.vocab.strings.add('NP')
|
||||||
np_deps = set(doc.vocab.strings[label] for label in labels)
|
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||||
close_app = doc.vocab.strings['nk']
|
close_app = doc.vocab.strings.add('nk')
|
||||||
|
|
||||||
rbracket = 0
|
rbracket = 0
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(obj):
|
||||||
|
|
|
@ -31,7 +31,7 @@ class EnglishDefaults(Language.Defaults):
|
||||||
lemma_rules = dict(LEMMA_RULES)
|
lemma_rules = dict(LEMMA_RULES)
|
||||||
lemma_index = dict(LEMMA_INDEX)
|
lemma_index = dict(LEMMA_INDEX)
|
||||||
lemma_exc = dict(LEMMA_EXC)
|
lemma_exc = dict(LEMMA_EXC)
|
||||||
sytax_iterators = dict(SYNTAX_ITERATORS)
|
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
|
||||||
|
|
||||||
class English(Language):
|
class English(Language):
|
||||||
|
|
|
@ -11,9 +11,9 @@ def noun_chunks(obj):
|
||||||
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
|
||||||
'attr', 'ROOT']
|
'attr', 'ROOT']
|
||||||
doc = obj.doc # Ensure works on both Doc and Span.
|
doc = obj.doc # Ensure works on both Doc and Span.
|
||||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||||
conj = doc.vocab.strings['conj']
|
conj = doc.vocab.strings.add('conj')
|
||||||
np_label = doc.vocab.strings['NP']
|
np_label = doc.vocab.strings.add('NP')
|
||||||
seen = set()
|
seen = set()
|
||||||
for i, word in enumerate(obj):
|
for i, word in enumerate(obj):
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
if word.pos not in (NOUN, PROPN, PRON):
|
||||||
|
|
|
@ -5,6 +5,7 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lemmatizer import LOOKUP
|
from .lemmatizer import LOOKUP
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ..norm_exceptions import BASE_NORMS
|
from ..norm_exceptions import BASE_NORMS
|
||||||
|
@ -22,6 +23,7 @@ class SpanishDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
tag_map = dict(TAG_MAP)
|
tag_map = dict(TAG_MAP)
|
||||||
stop_words = set(STOP_WORDS)
|
stop_words = set(STOP_WORDS)
|
||||||
|
sytax_iterators = dict(SYNTAX_ITERATORS)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_lemmatizer(cls, nlp=None):
|
def create_lemmatizer(cls, nlp=None):
|
||||||
|
|
|
@ -9,7 +9,8 @@ LIST_ICONS = [r'[\p{So}--[°]]']
|
||||||
_currency = r'\$|¢|£|€|¥|฿'
|
_currency = r'\$|¢|£|€|¥|฿'
|
||||||
_quotes = QUOTES.replace("'", '')
|
_quotes = QUOTES.replace("'", '')
|
||||||
|
|
||||||
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
[r'[,.:](?=[{a}])'.format(a=ALPHA)])
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
[r'(?<=[0-9])\+',
|
[r'(?<=[0-9])\+',
|
||||||
|
@ -21,7 +22,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||||
|
|
|
@ -107,7 +107,8 @@ class BaseDefaults(object):
|
||||||
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
||||||
'dependencies': lambda nlp, **cfg: [
|
'dependencies': lambda nlp, **cfg: [
|
||||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
NeuralDependencyParser(nlp.vocab, **cfg),
|
||||||
nonproj.deprojectivize],
|
nonproj.deprojectivize,
|
||||||
|
],
|
||||||
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,6 +127,7 @@ class BaseDefaults(object):
|
||||||
lemma_index = {}
|
lemma_index = {}
|
||||||
morph_rules = {}
|
morph_rules = {}
|
||||||
lex_attr_getters = LEX_ATTRS
|
lex_attr_getters = LEX_ATTRS
|
||||||
|
syntax_iterators = {}
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
|
@ -182,6 +184,35 @@ class Language(object):
|
||||||
flat_list.append(pipe)
|
flat_list.append(pipe)
|
||||||
self.pipeline = flat_list
|
self.pipeline = flat_list
|
||||||
|
|
||||||
|
# Conveniences to access pipeline components
|
||||||
|
@property
|
||||||
|
def tensorizer(self):
|
||||||
|
return self.get_component('tensorizer')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def tagger(self):
|
||||||
|
return self.get_component('tagger')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def parser(self):
|
||||||
|
return self.get_component('parser')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def entity(self):
|
||||||
|
return self.get_component('ner')
|
||||||
|
|
||||||
|
@property
|
||||||
|
def matcher(self):
|
||||||
|
return self.get_component('matcher')
|
||||||
|
|
||||||
|
def get_component(self, name):
|
||||||
|
if self.pipeline in (True, None):
|
||||||
|
return None
|
||||||
|
for proc in self.pipeline:
|
||||||
|
if hasattr(proc, 'name') and proc.name.endswith(name):
|
||||||
|
return proc
|
||||||
|
return None
|
||||||
|
|
||||||
def __call__(self, text, disable=[]):
|
def __call__(self, text, disable=[]):
|
||||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
and can contain arbtrary whitespace. Alignment into the original string
|
and can contain arbtrary whitespace. Alignment into the original string
|
||||||
|
|
|
@ -30,6 +30,7 @@ cdef class Morphology:
|
||||||
cdef public object n_tags
|
cdef public object n_tags
|
||||||
cdef public object reverse_index
|
cdef public object reverse_index
|
||||||
cdef public object tag_names
|
cdef public object tag_names
|
||||||
|
cdef public object exc
|
||||||
|
|
||||||
cdef RichTagC* rich_tags
|
cdef RichTagC* rich_tags
|
||||||
cdef PreshMapArray _cache
|
cdef PreshMapArray _cache
|
||||||
|
|
|
@ -33,7 +33,7 @@ def _normalize_props(props):
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
def __init__(self, StringStore string_store, tag_map, lemmatizer):
|
def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = string_store
|
self.strings = string_store
|
||||||
self.tag_map = {}
|
self.tag_map = {}
|
||||||
|
@ -53,9 +53,14 @@ cdef class Morphology:
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
self.rich_tags[i].pos = attrs[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
self._cache = PreshMapArray(self.n_tags)
|
self._cache = PreshMapArray(self.n_tags)
|
||||||
|
self.exc = {}
|
||||||
|
if exc is not None:
|
||||||
|
for (tag_str, orth_str), attrs in exc.items():
|
||||||
|
self.add_special_case(tag_str, orth_str, attrs)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Morphology, (self.strings, self.tag_map, self.lemmatizer), None, None)
|
return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
|
||||||
|
self.exc), None, None)
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
|
@ -106,6 +111,7 @@ cdef class Morphology:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
|
self.exc[(tag_str, orth_str)] = dict(attrs)
|
||||||
tag = self.strings.add(tag_str)
|
tag = self.strings.add(tag_str)
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
|
|
|
@ -233,8 +233,11 @@ class NeuralTagger(object):
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
# Don't clobber preset POS tags
|
||||||
|
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
|
||||||
|
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
||||||
idx += 1
|
idx += 1
|
||||||
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
|
||||||
docs, tokvecs = docs_tokvecs
|
docs, tokvecs = docs_tokvecs
|
||||||
|
@ -286,7 +289,8 @@ class NeuralTagger(object):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
if new_tag_map:
|
if new_tag_map:
|
||||||
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
vocab.morphology = Morphology(vocab.strings, new_tag_map,
|
||||||
vocab.morphology.lemmatizer)
|
vocab.morphology.lemmatizer,
|
||||||
|
exc=vocab.morphology.exc)
|
||||||
token_vector_width = pipeline[0].model.nO
|
token_vector_width = pipeline[0].model.nO
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
|
||||||
|
@ -322,7 +326,9 @@ class NeuralTagger(object):
|
||||||
tag_map = msgpack.loads(b, encoding='utf8')
|
tag_map = msgpack.loads(b, encoding='utf8')
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer)
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
|
exc=self.vocab.morphology.exc)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
|
@ -354,7 +360,9 @@ class NeuralTagger(object):
|
||||||
tag_map = msgpack.loads(file_.read(), encoding='utf8')
|
tag_map = msgpack.loads(file_.read(), encoding='utf8')
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer)
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
|
exc=self.vocab.morphology.exc)
|
||||||
|
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
||||||
|
|
|
@ -164,6 +164,7 @@ cdef class precompute_hiddens:
|
||||||
return best, backprop
|
return best, backprop
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef void sum_state_features(float* output,
|
cdef void sum_state_features(float* output,
|
||||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||||
cdef int idx, b, f, i
|
cdef int idx, b, f, i
|
||||||
|
|
|
@ -13,7 +13,7 @@ from .. import util
|
||||||
|
|
||||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
||||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
'nl', 'pl', 'pt', 'sv', 'xx']
|
||||||
_models = {'en': ['en_core_web_sm', 'en_core_web_md'],
|
_models = {'en': ['en_depent_web_sm', 'en_core_web_md'],
|
||||||
'de': ['de_core_news_md'],
|
'de': ['de_core_news_md'],
|
||||||
'fr': ['fr_depvec_web_lg'],
|
'fr': ['fr_depvec_web_lg'],
|
||||||
'xx': ['xx_ent_web_md']}
|
'xx': ['xx_ent_web_md']}
|
||||||
|
@ -22,25 +22,29 @@ _models = {'en': ['en_core_web_sm', 'en_core_web_md'],
|
||||||
# only used for tests that require loading the models
|
# only used for tests that require loading the models
|
||||||
# in all other cases, use specific instances
|
# in all other cases, use specific instances
|
||||||
|
|
||||||
@pytest.fixture(params=_models['en'], scope="session")
|
@pytest.fixture(params=_models['en'])
|
||||||
def EN(request):
|
def EN(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_models['de'], scope="session")
|
@pytest.fixture(params=_models['de'])
|
||||||
def DE(request):
|
def DE(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_models['fr'], scope="session")
|
@pytest.fixture(params=_models['fr'])
|
||||||
def FR(request):
|
def FR(request):
|
||||||
return load_test_model(request.param)
|
return load_test_model(request.param)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(params=_languages)
|
#@pytest.fixture(params=_languages)
|
||||||
def tokenizer(request):
|
#def tokenizer(request):
|
||||||
lang = util.get_lang_class(request.param)
|
#lang = util.get_lang_class(request.param)
|
||||||
return lang.Defaults.create_tokenizer()
|
#return lang.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def tokenizer():
|
||||||
|
return util.get_lang_class('xx').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -68,7 +72,7 @@ def de_tokenizer():
|
||||||
return util.get_lang_class('de').Defaults.create_tokenizer()
|
return util.get_lang_class('de').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture
|
||||||
def fr_tokenizer():
|
def fr_tokenizer():
|
||||||
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
return util.get_lang_class('fr').Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
@ -143,4 +147,4 @@ def pytest_runtest_setup(item):
|
||||||
if item.get_marker('models'):
|
if item.get_marker('models'):
|
||||||
for arg in item.get_marker('models').args:
|
for arg in item.get_marker('models').args:
|
||||||
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
|
if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
|
||||||
pytest.skip()
|
pytest.skip("need --%s or --all option to run" % arg)
|
||||||
|
|
|
@ -1,72 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import numpy
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
|
||||||
class TestModelSanity:
|
|
||||||
"""
|
|
||||||
This is to make sure the model works as expected. The tests make sure that
|
|
||||||
values are properly set.
|
|
||||||
Tests are not meant to evaluate the content of the output, only make sure
|
|
||||||
the output is formally okay.
|
|
||||||
"""
|
|
||||||
@pytest.fixture(scope='class', params=['en','de'])
|
|
||||||
def example(self, request, EN, DE):
|
|
||||||
assert EN.entity != None
|
|
||||||
assert DE.entity != None
|
|
||||||
if request.param == 'en':
|
|
||||||
doc = EN(u'There was a stranger standing at the big ' +
|
|
||||||
u'street talking to herself.')
|
|
||||||
elif request.param == 'de':
|
|
||||||
doc = DE(u'An der großen Straße stand eine merkwürdige ' +
|
|
||||||
u'Gestalt und führte Selbstgespräche.')
|
|
||||||
return doc
|
|
||||||
|
|
||||||
def test_tokenization(self, example):
|
|
||||||
# tokenization should split the document into tokens
|
|
||||||
assert len(example) > 1
|
|
||||||
|
|
||||||
def test_tagging(self, example):
|
|
||||||
# if tagging was done properly, pos tags shouldn't be empty
|
|
||||||
assert example.is_tagged
|
|
||||||
assert all( t.pos != 0 for t in example )
|
|
||||||
assert all( t.tag != 0 for t in example )
|
|
||||||
|
|
||||||
def test_parsing(self, example):
|
|
||||||
# if parsing was done properly
|
|
||||||
# - dependency labels shouldn't be empty
|
|
||||||
# - the head of some tokens should not be root
|
|
||||||
assert example.is_parsed
|
|
||||||
assert all( t.dep != 0 for t in example )
|
|
||||||
assert any( t.dep != i for i,t in enumerate(example) )
|
|
||||||
|
|
||||||
def test_ner(self, example):
|
|
||||||
# if ner was done properly, ent_iob shouldn't be empty
|
|
||||||
assert all([t.ent_iob != 0 for t in example])
|
|
||||||
|
|
||||||
def test_vectors(self, example):
|
|
||||||
# if vectors are available, they should differ on different words
|
|
||||||
# this isn't a perfect test since this could in principle fail
|
|
||||||
# in a sane model as well,
|
|
||||||
# but that's very unlikely and a good indicator if something is wrong
|
|
||||||
vector0 = example[0].vector
|
|
||||||
vector1 = example[1].vector
|
|
||||||
vector2 = example[2].vector
|
|
||||||
assert not numpy.array_equal(vector0,vector1)
|
|
||||||
assert not numpy.array_equal(vector0,vector2)
|
|
||||||
assert not numpy.array_equal(vector1,vector2)
|
|
||||||
|
|
||||||
def test_probs(self, example):
|
|
||||||
# if frequencies/probabilities are okay, they should differ for
|
|
||||||
# different words
|
|
||||||
# this isn't a perfect test since this could in principle fail
|
|
||||||
# in a sane model as well,
|
|
||||||
# but that's very unlikely and a good indicator if something is wrong
|
|
||||||
prob0 = example[0].prob
|
|
||||||
prob1 = example[1].prob
|
|
||||||
prob2 = example[2].prob
|
|
||||||
assert not prob0 == prob1
|
|
||||||
assert not prob0 == prob2
|
|
||||||
assert not prob1 == prob2
|
|
77
spacy/tests/lang/de/test_models.py
Normal file
77
spacy/tests/lang/de/test_models.py
Normal file
|
@ -0,0 +1,77 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def example(DE):
|
||||||
|
"""
|
||||||
|
This is to make sure the model works as expected. The tests make sure that
|
||||||
|
values are properly set. Tests are not meant to evaluate the content of the
|
||||||
|
output, only make sure the output is formally okay.
|
||||||
|
"""
|
||||||
|
assert DE.entity != None
|
||||||
|
return DE('An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_tokenization(example):
|
||||||
|
# tokenization should split the document into tokens
|
||||||
|
assert len(example) > 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_tagging(example):
|
||||||
|
# if tagging was done properly, pos tags shouldn't be empty
|
||||||
|
assert example.is_tagged
|
||||||
|
assert all(t.pos != 0 for t in example)
|
||||||
|
assert all(t.tag != 0 for t in example)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_parsing(example):
|
||||||
|
# if parsing was done properly
|
||||||
|
# - dependency labels shouldn't be empty
|
||||||
|
# - the head of some tokens should not be root
|
||||||
|
assert example.is_parsed
|
||||||
|
assert all(t.dep != 0 for t in example)
|
||||||
|
assert any(t.dep != i for i,t in enumerate(example))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_ner(example):
|
||||||
|
# if ner was done properly, ent_iob shouldn't be empty
|
||||||
|
assert all([t.ent_iob != 0 for t in example])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_vectors(example):
|
||||||
|
# if vectors are available, they should differ on different words
|
||||||
|
# this isn't a perfect test since this could in principle fail
|
||||||
|
# in a sane model as well,
|
||||||
|
# but that's very unlikely and a good indicator if something is wrong
|
||||||
|
vector0 = example[0].vector
|
||||||
|
vector1 = example[1].vector
|
||||||
|
vector2 = example[2].vector
|
||||||
|
assert not numpy.array_equal(vector0,vector1)
|
||||||
|
assert not numpy.array_equal(vector0,vector2)
|
||||||
|
assert not numpy.array_equal(vector1,vector2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.models('de')
|
||||||
|
def test_de_models_probs(example):
|
||||||
|
# if frequencies/probabilities are okay, they should differ for
|
||||||
|
# different words
|
||||||
|
# this isn't a perfect test since this could in principle fail
|
||||||
|
# in a sane model as well,
|
||||||
|
# but that's very unlikely and a good indicator if something is wrong
|
||||||
|
prob0 = example[0].prob
|
||||||
|
prob1 = example[1].prob
|
||||||
|
prob2 = example[2].prob
|
||||||
|
assert not prob0 == prob1
|
||||||
|
assert not prob0 == prob2
|
||||||
|
assert not prob1 == prob2
|
|
@ -110,7 +110,6 @@ def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
|
||||||
assert [token.norm_ for token in tokens] == norms
|
assert [token.norm_ for token in tokens] == norms
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
|
||||||
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
|
|
|
@ -26,12 +26,12 @@ def test_en_lemmatizer_base_forms(en_lemmatizer):
|
||||||
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
|
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
|
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
|
||||||
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
|
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_punct(en_lemmatizer):
|
def test_en_lemmatizer_punct(en_lemmatizer):
|
||||||
assert en_lemmatizer.punct('“') == set(['"'])
|
assert en_lemmatizer.punct('“') == set(['"'])
|
||||||
assert en_lemmatizer.punct('“') == set(['"'])
|
assert en_lemmatizer.punct('“') == set(['"'])
|
||||||
|
@ -40,7 +40,8 @@ def test_en_lemmatizer_punct(en_lemmatizer):
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_lemmatizer_lemma_assignment(EN):
|
def test_en_lemmatizer_lemma_assignment(EN):
|
||||||
text = "Bananas in pyjamas are geese."
|
text = "Bananas in pyjamas are geese."
|
||||||
doc = EN.tokenizer(text)
|
doc = EN.make_doc(text)
|
||||||
|
EN.tensorizer(doc)
|
||||||
assert all(t.lemma_ == '' for t in doc)
|
assert all(t.lemma_ == '' for t in doc)
|
||||||
EN.tagger(doc)
|
EN.tagger(doc)
|
||||||
assert all(t.lemma_ != '' for t in doc)
|
assert all(t.lemma_ != '' for t in doc)
|
||||||
|
|
76
spacy/tests/lang/en/test_models.py
Normal file
76
spacy/tests/lang/en/test_models.py
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import numpy
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def example(EN):
|
||||||
|
"""
|
||||||
|
This is to make sure the model works as expected. The tests make sure that
|
||||||
|
values are properly set. Tests are not meant to evaluate the content of the
|
||||||
|
output, only make sure the output is formally okay.
|
||||||
|
"""
|
||||||
|
assert EN.entity != None
|
||||||
|
return EN('There was a stranger standing at the big street talking to herself.')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_tokenization(example):
|
||||||
|
# tokenization should split the document into tokens
|
||||||
|
assert len(example) > 1
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_tagging(example):
|
||||||
|
# if tagging was done properly, pos tags shouldn't be empty
|
||||||
|
assert example.is_tagged
|
||||||
|
assert all(t.pos != 0 for t in example)
|
||||||
|
assert all(t.tag != 0 for t in example)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_parsing(example):
|
||||||
|
# if parsing was done properly
|
||||||
|
# - dependency labels shouldn't be empty
|
||||||
|
# - the head of some tokens should not be root
|
||||||
|
assert example.is_parsed
|
||||||
|
assert all(t.dep != 0 for t in example)
|
||||||
|
assert any(t.dep != i for i,t in enumerate(example))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_ner(example):
|
||||||
|
# if ner was done properly, ent_iob shouldn't be empty
|
||||||
|
assert all([t.ent_iob != 0 for t in example])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_vectors(example):
|
||||||
|
# if vectors are available, they should differ on different words
|
||||||
|
# this isn't a perfect test since this could in principle fail
|
||||||
|
# in a sane model as well,
|
||||||
|
# but that's very unlikely and a good indicator if something is wrong
|
||||||
|
vector0 = example[0].vector
|
||||||
|
vector1 = example[1].vector
|
||||||
|
vector2 = example[2].vector
|
||||||
|
assert not numpy.array_equal(vector0,vector1)
|
||||||
|
assert not numpy.array_equal(vector0,vector2)
|
||||||
|
assert not numpy.array_equal(vector1,vector2)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_en_models_probs(example):
|
||||||
|
# if frequencies/probabilities are okay, they should differ for
|
||||||
|
# different words
|
||||||
|
# this isn't a perfect test since this could in principle fail
|
||||||
|
# in a sane model as well,
|
||||||
|
# but that's very unlikely and a good indicator if something is wrong
|
||||||
|
prob0 = example[0].prob
|
||||||
|
prob1 = example[1].prob
|
||||||
|
prob2 = example[2].prob
|
||||||
|
assert not prob0 == prob1
|
||||||
|
assert not prob0 == prob2
|
||||||
|
assert not prob1 == prob2
|
|
@ -17,15 +17,17 @@ def test_en_ner_simple_types(EN):
|
||||||
assert ents[1].label_ == 'GPE'
|
assert ents[1].label_ == 'GPE'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_ner_consistency_bug(EN):
|
def test_en_ner_consistency_bug(EN):
|
||||||
'''Test an arbitrary sequence-consistency bug encountered during speed test'''
|
'''Test an arbitrary sequence-consistency bug encountered during speed test'''
|
||||||
tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
|
tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
|
||||||
tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
|
tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', disable=['ner'])
|
||||||
tokens.ents += tuple(EN.matcher(tokens))
|
tokens.ents += tuple(EN.matcher(tokens))
|
||||||
EN.entity(tokens)
|
EN.entity(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_ner_unit_end_gazetteer(EN):
|
def test_en_ner_unit_end_gazetteer(EN):
|
||||||
'''Test a bug in the interaction between the NER model and the gazetteer'''
|
'''Test a bug in the interaction between the NER model and the gazetteer'''
|
||||||
|
|
|
@ -61,21 +61,21 @@ def test_en_sbd_serialization_projective(EN):
|
||||||
|
|
||||||
|
|
||||||
TEST_CASES = [
|
TEST_CASES = [
|
||||||
("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
|
pytest.mark.xfail(("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."])),
|
||||||
("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
|
("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
|
||||||
pytest.mark.xfail(("There it is! I found it.", ["There it is!", "I found it."])),
|
("There it is! I found it.", ["There it is!", "I found it."]),
|
||||||
("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
|
("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
|
||||||
("Please turn to p. 55.", ["Please turn to p. 55."]),
|
("Please turn to p. 55.", ["Please turn to p. 55."]),
|
||||||
("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
|
("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
|
||||||
("They closed the deal with Pitt, Briggs & Co. at noon.", ["They closed the deal with Pitt, Briggs & Co. at noon."]),
|
("They closed the deal with Pitt, Briggs & Co. at noon.", ["They closed the deal with Pitt, Briggs & Co. at noon."]),
|
||||||
pytest.mark.xfail(("Let's ask Jane and co. They should know.", ["Let's ask Jane and co.", "They should know."])),
|
("Let's ask Jane and co. They should know.", ["Let's ask Jane and co.", "They should know."]),
|
||||||
("They closed the deal with Pitt, Briggs & Co. It closed yesterday.", ["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."]),
|
("They closed the deal with Pitt, Briggs & Co. It closed yesterday.", ["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."]),
|
||||||
("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
|
("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
|
||||||
("St. Michael's Church is on 5th st. near the light.", ["St. Michael's Church is on 5th st. near the light."]),
|
pytest.mark.xfail(("St. Michael's Church is on 5th st. near the light.", ["St. Michael's Church is on 5th st. near the light."])),
|
||||||
("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
|
("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
|
||||||
("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
|
("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
|
||||||
pytest.mark.xfail(("I live in the E.U. How about you?", ["I live in the E.U.", "How about you?"])),
|
("I live in the E.U. How about you?", ["I live in the E.U.", "How about you?"]),
|
||||||
pytest.mark.xfail(("I live in the U.S. How about you?", ["I live in the U.S.", "How about you?"])),
|
("I live in the U.S. How about you?", ["I live in the U.S.", "How about you?"]),
|
||||||
("I work for the U.S. Government in Virginia.", ["I work for the U.S. Government in Virginia."]),
|
("I work for the U.S. Government in Virginia.", ["I work for the U.S. Government in Virginia."]),
|
||||||
("I have lived in the U.S. for 20 years.", ["I have lived in the U.S. for 20 years."]),
|
("I have lived in the U.S. for 20 years.", ["I have lived in the U.S. for 20 years."]),
|
||||||
pytest.mark.xfail(("At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."])),
|
pytest.mark.xfail(("At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."])),
|
||||||
|
@ -84,7 +84,7 @@ TEST_CASES = [
|
||||||
("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
|
("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
|
||||||
("Her email is Jane.Doe@example.com. I sent her an email.", ["Her email is Jane.Doe@example.com.", "I sent her an email."]),
|
("Her email is Jane.Doe@example.com. I sent her an email.", ["Her email is Jane.Doe@example.com.", "I sent her an email."]),
|
||||||
("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]),
|
("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]),
|
||||||
("She turned to him, 'This is great.' she said.", ["She turned to him, 'This is great.' she said."]),
|
pytest.mark.xfail(("She turned to him, 'This is great.' she said.", ["She turned to him, 'This is great.' she said."])),
|
||||||
pytest.mark.xfail(('She turned to him, "This is great." she said.', ['She turned to him, "This is great." she said.'])),
|
pytest.mark.xfail(('She turned to him, "This is great." she said.', ['She turned to him, "This is great." she said.'])),
|
||||||
('She turned to him, "This is great." She held the book out to show him.', ['She turned to him, "This is great."', "She held the book out to show him."]),
|
('She turned to him, "This is great." She held the book out to show him.', ['She turned to him, "This is great."', "She held the book out to show him."]),
|
||||||
("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
|
("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
|
||||||
|
@ -103,18 +103,19 @@ TEST_CASES = [
|
||||||
("This is a sentence\ncut off in the middle because pdf.", ["This is a sentence\ncut off in the middle because pdf."]),
|
("This is a sentence\ncut off in the middle because pdf.", ["This is a sentence\ncut off in the middle because pdf."]),
|
||||||
("It was a cold \nnight in the city.", ["It was a cold \nnight in the city."]),
|
("It was a cold \nnight in the city.", ["It was a cold \nnight in the city."]),
|
||||||
pytest.mark.xfail(("features\ncontact manager\nevents, activities\n", ["features", "contact manager", "events, activities"])),
|
pytest.mark.xfail(("features\ncontact manager\nevents, activities\n", ["features", "contact manager", "events, activities"])),
|
||||||
("You can find it at N°. 1026.253.553. That is where the treasure is.", ["You can find it at N°. 1026.253.553.", "That is where the treasure is."]),
|
pytest.mark.xfail(("You can find it at N°. 1026.253.553. That is where the treasure is.", ["You can find it at N°. 1026.253.553.", "That is where the treasure is."])),
|
||||||
("She works at Yahoo! in the accounting department.", ["She works at Yahoo! in the accounting department."]),
|
("She works at Yahoo! in the accounting department.", ["She works at Yahoo! in the accounting department."]),
|
||||||
pytest.mark.xfail(("We make a good team, you and I. Did you see Albert I. Jones yesterday?", ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"])),
|
("We make a good team, you and I. Did you see Albert I. Jones yesterday?", ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"]),
|
||||||
("Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]),
|
("Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]),
|
||||||
(""""Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", ['"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).']),
|
pytest.mark.xfail((""""Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", ['"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).'])),
|
||||||
pytest.mark.xfail(("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."])),
|
("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]),
|
||||||
("I never meant that.... She left the store.", ["I never meant that....", "She left the store."]),
|
("I never meant that.... She left the store.", ["I never meant that....", "She left the store."]),
|
||||||
pytest.mark.xfail(("I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."])),
|
pytest.mark.xfail(("I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."])),
|
||||||
pytest.mark.xfail(("One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])),
|
pytest.mark.xfail(("One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])),
|
||||||
pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]))
|
pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]))
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@pytest.mark.skip
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
@pytest.mark.parametrize('text,expected_sents', TEST_CASES)
|
@pytest.mark.parametrize('text,expected_sents', TEST_CASES)
|
||||||
def test_en_sbd_prag(EN, text, expected_sents):
|
def test_en_sbd_prag(EN, text, expected_sents):
|
||||||
|
|
|
@ -22,7 +22,7 @@ def test_en_tagger_load_morph_exc(en_tokenizer):
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_tag_names(EN):
|
def test_tag_names(EN):
|
||||||
text = "I ate pizzas with anchovies."
|
text = "I ate pizzas with anchovies."
|
||||||
doc = EN(text, parse=False, tag=True)
|
doc = EN(text, disable=['parser'])
|
||||||
assert type(doc[2].pos) == int
|
assert type(doc[2].pos) == int
|
||||||
assert isinstance(doc[2].pos_, six.text_type)
|
assert isinstance(doc[2].pos_, six.text_type)
|
||||||
assert type(doc[2].dep) == int
|
assert type(doc[2].dep) == int
|
||||||
|
@ -30,11 +30,12 @@ def test_tag_names(EN):
|
||||||
assert doc[2].tag_ == u'NNS'
|
assert doc[2].tag_ == u'NNS'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_tagger_spaces(EN):
|
def test_en_tagger_spaces(EN):
|
||||||
"""Ensure spaces are assigned the POS tag SPACE"""
|
"""Ensure spaces are assigned the POS tag SPACE"""
|
||||||
text = "Some\nspaces are\tnecessary."
|
text = "Some\nspaces are\tnecessary."
|
||||||
doc = EN(text, tag=True, parse=False)
|
doc = EN(text, disable=['parser'])
|
||||||
assert doc[0].pos != SPACE
|
assert doc[0].pos != SPACE
|
||||||
assert doc[0].pos_ != 'SPACE'
|
assert doc[0].pos_ != 'SPACE'
|
||||||
assert doc[1].pos == SPACE
|
assert doc[1].pos == SPACE
|
||||||
|
@ -45,6 +46,7 @@ def test_en_tagger_spaces(EN):
|
||||||
assert doc[4].pos == SPACE
|
assert doc[4].pos == SPACE
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_en_tagger_return_char(EN):
|
def test_en_tagger_return_char(EN):
|
||||||
"""Ensure spaces are assigned the POS tag SPACE"""
|
"""Ensure spaces are assigned the POS tag SPACE"""
|
||||||
|
|
|
@ -5,11 +5,11 @@ import pytest
|
||||||
|
|
||||||
DEFAULT_TESTS = [
|
DEFAULT_TESTS = [
|
||||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
|
||||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||||
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
|
||||||
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
|
||||||
('A .hu.', ['A', '.hu', '.']),
|
pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
|
||||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||||
('A pl.', ['A', 'pl.']),
|
('A pl.', ['A', 'pl.']),
|
||||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||||
|
@ -18,7 +18,9 @@ DEFAULT_TESTS = [
|
||||||
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
('Valami ...van...', ['Valami', '...', 'van', '...']),
|
||||||
('Valami...', ['Valami', '...']),
|
('Valami...', ['Valami', '...']),
|
||||||
('Valami ...', ['Valami', '...']),
|
('Valami ...', ['Valami', '...']),
|
||||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
('Valami ... más.', ['Valami', '...', 'más', '.']),
|
||||||
|
('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']),
|
||||||
|
('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?'])
|
||||||
]
|
]
|
||||||
|
|
||||||
HYPHEN_TESTS = [
|
HYPHEN_TESTS = [
|
||||||
|
@ -225,11 +227,11 @@ QUOTE_TESTS = [
|
||||||
|
|
||||||
DOT_TESTS = [
|
DOT_TESTS = [
|
||||||
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
|
||||||
('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']),
|
pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
|
||||||
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
|
||||||
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
|
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
|
||||||
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
|
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
|
||||||
('A .hu.', ['A', '.hu', '.']),
|
pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
|
||||||
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
|
||||||
('A pl.', ['A', 'pl.']),
|
('A pl.', ['A', 'pl.']),
|
||||||
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
|
||||||
|
@ -241,6 +243,24 @@ DOT_TESTS = [
|
||||||
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
('Valami ... más.', ['Valami', '...', 'más', '.'])
|
||||||
]
|
]
|
||||||
|
|
||||||
|
TYPO_TESTS = [
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége.Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége .Ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége!ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége !ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
(
|
||||||
|
'Ez egy mondat vége?Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('Ez egy mondat vége ?Ez egy másik eleje.',
|
||||||
|
['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']),
|
||||||
|
('egy,kettő', ['egy', ',', 'kettő']),
|
||||||
|
('egy ,kettő', ['egy', ',', 'kettő']),
|
||||||
|
('egy :kettő', ['egy', ':', 'kettő']),
|
||||||
|
]
|
||||||
|
|
||||||
WIKI_TESTS = [
|
WIKI_TESTS = [
|
||||||
('!"', ['!', '"']),
|
('!"', ['!', '"']),
|
||||||
('lány"a', ['lány', '"', 'a']),
|
('lány"a', ['lány', '"', 'a']),
|
||||||
|
@ -253,7 +273,7 @@ WIKI_TESTS = [
|
||||||
('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
|
('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid'])
|
||||||
]
|
]
|
||||||
|
|
||||||
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS
|
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
|
||||||
|
|
|
@ -19,6 +19,7 @@ def test_issue429(EN):
|
||||||
matcher = Matcher(EN.vocab)
|
matcher = Matcher(EN.vocab)
|
||||||
matcher.add('TEST', merge_phrases, [{'ORTH': 'a'}])
|
matcher.add('TEST', merge_phrases, [{'ORTH': 'a'}])
|
||||||
doc = EN.make_doc('a b c')
|
doc = EN.make_doc('a b c')
|
||||||
|
EN.tensorizer(doc)
|
||||||
EN.tagger(doc)
|
EN.tagger(doc)
|
||||||
matcher(doc)
|
matcher(doc)
|
||||||
EN.entity(doc)
|
EN.entity(doc)
|
||||||
|
|
|
@ -6,6 +6,7 @@ from ..util import get_doc
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_issue514(EN):
|
def test_issue514(EN):
|
||||||
"""Test serializing after adding entity"""
|
"""Test serializing after adding entity"""
|
||||||
|
|
|
@ -27,7 +27,6 @@ def test_issue615(en_tokenizer):
|
||||||
matcher = Matcher(doc.vocab)
|
matcher = Matcher(doc.vocab)
|
||||||
matcher.add(label, merge_phrases, pattern)
|
matcher.add(label, merge_phrases, pattern)
|
||||||
match = matcher(doc)
|
match = matcher(doc)
|
||||||
print(match)
|
|
||||||
entities = list(doc.ents)
|
entities = list(doc.ents)
|
||||||
|
|
||||||
assert entities != [] #assertion 1
|
assert entities != [] #assertion 1
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_issue693(EN):
|
def test_issue693(EN):
|
||||||
"""Test that doc.noun_chunks parses the complete sentence."""
|
"""Test that doc.noun_chunks parses the complete sentence."""
|
||||||
|
@ -14,7 +15,5 @@ def test_issue693(EN):
|
||||||
doc2 = EN(text2)
|
doc2 = EN(text2)
|
||||||
chunks1 = [chunk for chunk in doc1.noun_chunks]
|
chunks1 = [chunk for chunk in doc1.noun_chunks]
|
||||||
chunks2 = [chunk for chunk in doc2.noun_chunks]
|
chunks2 = [chunk for chunk in doc2.noun_chunks]
|
||||||
for word in doc1:
|
|
||||||
print(word.text, word.dep_, word.head.text)
|
|
||||||
assert len(chunks1) == 2
|
assert len(chunks1) == 2
|
||||||
assert len(chunks2) == 2
|
assert len(chunks2) == 2
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_issue704(EN):
|
def test_issue704(EN):
|
||||||
"""Test that sentence boundaries are detected correctly."""
|
"""Test that sentence boundaries are detected correctly."""
|
||||||
|
|
|
@ -30,6 +30,7 @@ def fr_tokenizer_w_infix():
|
||||||
return French.Defaults.create_tokenizer()
|
return French.Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip
|
||||||
@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]),
|
@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]),
|
||||||
("j'ai", ["j'", "ai"])])
|
("j'ai", ["j'", "ai"])])
|
||||||
def test_issue768(fr_tokenizer_w_infix, text, expected_tokens):
|
def test_issue768(fr_tokenizer_w_infix, text, expected_tokens):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from ...matcher import Matcher
|
from ...matcher import Matcher
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
import contextlib
|
import contextlib
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -9,7 +8,6 @@ import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
import pathlib
|
|
||||||
from ...gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ...pipeline import EntityRecognizer
|
from ...pipeline import EntityRecognizer
|
||||||
from ...lang.en import English
|
from ...lang.en import English
|
||||||
|
@ -57,19 +55,13 @@ def additional_entity_types():
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def temp_save_model(model):
|
def temp_save_model(model):
|
||||||
model_dir = Path(tempfile.mkdtemp())
|
model_dir = tempfile.mkdtemp()
|
||||||
# store the fine tuned model
|
model.to_disk(model_dir)
|
||||||
with (model_dir / "config.json").open('w') as file_:
|
|
||||||
data = json.dumps(model.cfg)
|
|
||||||
if not isinstance(data, unicode):
|
|
||||||
data = data.decode('utf8')
|
|
||||||
file_.write(data)
|
|
||||||
model.model.dump((model_dir / 'model').as_posix())
|
|
||||||
yield model_dir
|
yield model_dir
|
||||||
shutil.rmtree(model_dir.as_posix())
|
shutil.rmtree(model_dir.as_posix())
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_issue910(EN, train_data, additional_entity_types):
|
def test_issue910(EN, train_data, additional_entity_types):
|
||||||
'''Test that adding entities and resuming training works passably OK.
|
'''Test that adding entities and resuming training works passably OK.
|
||||||
|
@ -79,24 +71,27 @@ def test_issue910(EN, train_data, additional_entity_types):
|
||||||
2) There's no way to set the learning rate for the weight update, so we
|
2) There's no way to set the learning rate for the weight update, so we
|
||||||
end up out-of-scale, causing it to learn too fast.
|
end up out-of-scale, causing it to learn too fast.
|
||||||
'''
|
'''
|
||||||
doc = EN(u"I am looking for a restaurant in Berlin")
|
nlp = EN
|
||||||
|
doc = nlp(u"I am looking for a restaurant in Berlin")
|
||||||
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
|
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
|
||||||
# Fine tune the ner model
|
# Fine tune the ner model
|
||||||
for entity_type in additional_entity_types:
|
for entity_type in additional_entity_types:
|
||||||
nlp.entity.add_label(entity_type)
|
nlp.entity.add_label(entity_type)
|
||||||
|
|
||||||
nlp.entity.model.learn_rate = 0.001
|
sgd = Adam(nlp.entity.model[0].ops, 0.001)
|
||||||
for itn in range(10):
|
for itn in range(10):
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
for raw_text, entity_offsets in train_data:
|
for raw_text, entity_offsets in train_data:
|
||||||
doc = nlp.make_doc(raw_text)
|
doc = nlp.make_doc(raw_text)
|
||||||
nlp.tagger(doc)
|
nlp.tagger(doc)
|
||||||
|
nlp.tensorizer(doc)
|
||||||
gold = GoldParse(doc, entities=entity_offsets)
|
gold = GoldParse(doc, entities=entity_offsets)
|
||||||
loss = nlp.entity.update(doc, gold)
|
loss = nlp.entity.update(doc, gold, sgd=sgd, drop=0.5)
|
||||||
|
|
||||||
with temp_save_model(nlp.entity) as model_dir:
|
with temp_save_model(nlp.entity) as model_dir:
|
||||||
# Load the fine tuned model
|
# Load the fine tuned model
|
||||||
loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab)
|
loaded_ner = EntityRecognizer(nlp.vocab)
|
||||||
|
loaded_ner.from_disk(model_dir)
|
||||||
|
|
||||||
for raw_text, entity_offsets in train_data:
|
for raw_text, entity_offsets in train_data:
|
||||||
doc = nlp.make_doc(raw_text)
|
doc = nlp.make_doc(raw_text)
|
||||||
|
@ -104,6 +99,4 @@ def test_issue910(EN, train_data, additional_entity_types):
|
||||||
loaded_ner(doc)
|
loaded_ner(doc)
|
||||||
ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
|
ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
|
||||||
for start, end, label in entity_offsets:
|
for start, end, label in entity_offsets:
|
||||||
if (start, end) not in ents:
|
|
||||||
print(ents)
|
|
||||||
assert ents[(start, end)] == label
|
assert ents[(start, end)] == label
|
||||||
|
|
|
@ -4,13 +4,12 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models('en')
|
@pytest.mark.models('en')
|
||||||
def test_issue955(EN, doc):
|
def test_issue955(EN):
|
||||||
'''Test that we don't have any nested noun chunks'''
|
'''Test that we don't have any nested noun chunks'''
|
||||||
doc = EN('Does flight number three fifty-four require a connecting flight'
|
doc = EN('Does flight number three fifty-four require a connecting flight'
|
||||||
' to get to Boston?')
|
' to get to Boston?')
|
||||||
seen_tokens = set()
|
seen_tokens = set()
|
||||||
for np in doc.noun_chunks:
|
for np in doc.noun_chunks:
|
||||||
print(np.text, np.root.text, np.root.dep_, np.root.tag_)
|
|
||||||
for word in np:
|
for word in np:
|
||||||
key = (word.i, word.text)
|
key = (word.i, word.text)
|
||||||
assert key not in seen_tokens
|
assert key not in seen_tokens
|
||||||
|
|
|
@ -63,7 +63,6 @@ def test_lexeme_bytes_roundtrip(en_vocab):
|
||||||
alpha = en_vocab['alpha']
|
alpha = en_vocab['alpha']
|
||||||
assert one.orth != alpha.orth
|
assert one.orth != alpha.orth
|
||||||
assert one.lower != alpha.lower
|
assert one.lower != alpha.lower
|
||||||
print(one.orth, alpha.orth)
|
|
||||||
alpha.from_bytes(one.to_bytes())
|
alpha.from_bytes(one.to_bytes())
|
||||||
|
|
||||||
assert one.orth_ == alpha.orth_
|
assert one.orth_ == alpha.orth_
|
||||||
|
|
|
@ -26,7 +26,6 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
from ..attrs cimport SENT_START
|
from ..attrs cimport SENT_START
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||||
from ..syntax.iterators import CHUNKERS
|
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from ..compat import is_config
|
from ..compat import is_config
|
||||||
from .. import about
|
from .. import about
|
||||||
|
@ -65,6 +64,14 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||||
else:
|
else:
|
||||||
return Lexeme.get_struct_attr(token.lex, feat_name)
|
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||||
|
|
||||||
|
def _get_chunker(lang):
|
||||||
|
try:
|
||||||
|
cls = util.get_lang_class(lang)
|
||||||
|
except ImportError:
|
||||||
|
return None
|
||||||
|
except KeyError:
|
||||||
|
return None
|
||||||
|
return cls.Defaults.syntax_iterators.get(u'noun_chunks')
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
"""A sequence of Token objects. Access sentences and named entities, export
|
"""A sequence of Token objects. Access sentences and named entities, export
|
||||||
|
@ -117,7 +124,7 @@ cdef class Doc:
|
||||||
self.user_data = {}
|
self.user_data = {}
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
self._vector = None
|
self._vector = None
|
||||||
self.noun_chunks_iterator = CHUNKERS.get(self.vocab.lang)
|
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||||
cdef unicode orth
|
cdef unicode orth
|
||||||
cdef bint has_space
|
cdef bint has_space
|
||||||
if orths_and_spaces is None and words is not None:
|
if orths_and_spaces is None and words is not None:
|
||||||
|
@ -255,7 +262,7 @@ cdef class Doc:
|
||||||
return self.user_hooks['has_vector'](self)
|
return self.user_hooks['has_vector'](self)
|
||||||
elif any(token.has_vector for token in self):
|
elif any(token.has_vector for token in self):
|
||||||
return True
|
return True
|
||||||
elif self.tensor:
|
elif self.tensor is not None:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
@ -275,7 +282,7 @@ cdef class Doc:
|
||||||
elif self.has_vector and len(self):
|
elif self.has_vector and len(self):
|
||||||
self._vector = sum(t.vector for t in self) / len(self)
|
self._vector = sum(t.vector for t in self) / len(self)
|
||||||
return self._vector
|
return self._vector
|
||||||
elif self.tensor:
|
elif self.tensor is not None:
|
||||||
self._vector = self.tensor.mean(axis=0)
|
self._vector = self.tensor.mean(axis=0)
|
||||||
return self._vector
|
return self._vector
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -234,7 +234,7 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if 'has_vector' in self.doc.user_token_hooks:
|
if 'has_vector' in self.doc.user_token_hooks:
|
||||||
return self.doc.user_token_hooks['has_vector'](self)
|
return self.doc.user_token_hooks['has_vector'](self)
|
||||||
return self.vocab.has_vector(self.lex.c.orth)
|
return self.vocab.has_vector(self.c.lex.orth)
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
"""A real-valued meaning representation.
|
"""A real-valued meaning representation.
|
||||||
|
|
|
@ -155,7 +155,7 @@ def get_model_meta(path):
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
for setting in ['lang', 'name', 'version']:
|
for setting in ['lang', 'name', 'version']:
|
||||||
if setting not in meta:
|
if setting not in meta:
|
||||||
raise IOError('No %s setting found in model meta.json' % setting)
|
raise ValueError('No %s setting found in model meta.json' % setting)
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
@ -417,6 +417,7 @@ def read_json(location):
|
||||||
location (Path): Path to JSON file.
|
location (Path): Path to JSON file.
|
||||||
RETURNS (dict): Loaded JSON content.
|
RETURNS (dict): Loaded JSON content.
|
||||||
"""
|
"""
|
||||||
|
location = ensure_path(location)
|
||||||
with location.open('r', encoding='utf8') as f:
|
with location.open('r', encoding='utf8') as f:
|
||||||
return ujson.load(f)
|
return ujson.load(f)
|
||||||
|
|
||||||
|
@ -477,7 +478,7 @@ def print_table(data, title=None):
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
tpl_row = ' {:<15}' * len(data[0])
|
tpl_row = ' {:<15}' * len(data[0])
|
||||||
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
table = '\n'.join([tpl_row.format(l, unicode_(v)) for l, v in data])
|
||||||
if title:
|
if title:
|
||||||
print('\n \033[93m{}\033[0m'.format(title))
|
print('\n \033[93m{}\033[0m'.format(title))
|
||||||
print('\n{}\n'.format(table))
|
print('\n{}\n'.format(table))
|
||||||
|
@ -490,11 +491,12 @@ def print_markdown(data, title=None):
|
||||||
title (unicode or None): Title, will be rendered as headline 2.
|
title (unicode or None): Title, will be rendered as headline 2.
|
||||||
"""
|
"""
|
||||||
def excl_value(value):
|
def excl_value(value):
|
||||||
return Path(value).exists() # contains path (personal info)
|
# contains path, i.e. personal info
|
||||||
|
return isinstance(value, basestring_) and Path(value).exists()
|
||||||
|
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
|
markdown = ["* **{}:** {}".format(l, unicode_(v)) for l, v in data if not excl_value(v)]
|
||||||
if title:
|
if title:
|
||||||
print("\n## {}".format(title))
|
print("\n## {}".format(title))
|
||||||
print('\n{}\n'.format('\n'.join(markdown)))
|
print('\n{}\n'.format('\n'.join(markdown)))
|
||||||
|
|
|
@ -278,7 +278,7 @@ cdef class Vocab:
|
||||||
"""Check whether a word has a vector. Returns False if no
|
"""Check whether a word has a vector. Returns False if no
|
||||||
vectors have been loaded. Words can be looked up by string
|
vectors have been loaded. Words can be looked up by string
|
||||||
or int ID."""
|
or int ID."""
|
||||||
raise NotImplementedError
|
return False
|
||||||
|
|
||||||
def to_disk(self, path):
|
def to_disk(self, path):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
|
@ -28,8 +28,8 @@
|
||||||
|
|
||||||
- function getSocialImg() {
|
- function getSocialImg() {
|
||||||
- var base = SITE_URL + '/assets/img/social/preview_'
|
- var base = SITE_URL + '/assets/img/social/preview_'
|
||||||
- var image = 'default'
|
- var image = ALPHA ? 'alpha' : 'default'
|
||||||
- if (preview) image = preview
|
- if (preview) image = preview
|
||||||
- else if (SECTION == 'docs') image = 'docs'
|
- else if (SECTION == 'docs' && !ALPHA) image = 'docs'
|
||||||
- return base + image + '.jpg'
|
- return base + image + '.jpg'
|
||||||
- }
|
- }
|
||||||
|
|
|
@ -22,12 +22,12 @@ main.o-main.o-main--sidebar.o-main--aside
|
||||||
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
+infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs")
|
||||||
strong This page is part of the alpha documentation for spaCy v2.0.
|
strong This page is part of the alpha documentation for spaCy v2.0.
|
||||||
| It does not reflect the state of the latest stable release.
|
| It does not reflect the state of the latest stable release.
|
||||||
| Because v2.0 is still under development, the actual
|
| Because v2.0 is still under development, the implementation
|
||||||
| implementation may differ from the intended state described
|
| may differ from the intended state described here. See the
|
||||||
| here.
|
| #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes]
|
||||||
| #[+a("#") See here] for more information on how to install
|
| for details on how to install and test the new version. To
|
||||||
| and test the new version. To read the official docs for
|
| read the official docs for spaCy v1.x,
|
||||||
| v1.x, #[+a("https://spacy.io/docs") go here].
|
| #[+a("https://spacy.io/docs") go here].
|
||||||
|
|
||||||
!=yield
|
!=yield
|
||||||
|
|
||||||
|
|
BIN
website/assets/img/social/preview_alpha.jpg
Normal file
BIN
website/assets/img/social/preview_alpha.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 374 KiB |
|
@ -209,8 +209,8 @@ p
|
||||||
+cell Number of sentences (default: #[code 0]).
|
+cell Number of sentences (default: #[code 0]).
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --use-gpu], #[code -G]
|
+cell #[code --use-gpu], #[code -g]
|
||||||
+cell flag
|
+cell option
|
||||||
+cell Use GPU.
|
+cell Use GPU.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
|
|
|
@ -78,6 +78,14 @@ p
|
||||||
| #[code like_num], which includes language-specific words like "ten"
|
| #[code like_num], which includes language-specific words like "ten"
|
||||||
| or "hundred".
|
| or "hundred".
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[strong Syntax iterators]
|
||||||
|
| #[+src(gh("spaCy", "spacy/lang/en/syntax_iterators.py")) syntax_iterators.py]
|
||||||
|
+cell
|
||||||
|
| Functions that compute views of a #[code Doc] object based on its
|
||||||
|
| syntax. At the moment, only used for
|
||||||
|
| #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[strong Lemmatizer]
|
+cell #[strong Lemmatizer]
|
||||||
| #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
|
| #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
|
||||||
|
|
|
@ -42,6 +42,7 @@ p
|
||||||
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
|
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
|
||||||
+item #[+a("#norm-exceptions") Norm exceptions]
|
+item #[+a("#norm-exceptions") Norm exceptions]
|
||||||
+item #[+a("#lex-attrs") Lexical attributes]
|
+item #[+a("#lex-attrs") Lexical attributes]
|
||||||
|
+item #[+a("#syntax-iterators") Syntax iterators]
|
||||||
+item #[+a("#lemmatizer") Lemmatizer]
|
+item #[+a("#lemmatizer") Lemmatizer]
|
||||||
+item #[+a("#tag-map") Tag map]
|
+item #[+a("#tag-map") Tag map]
|
||||||
+item #[+a("#morph-rules") Morph rules]
|
+item #[+a("#morph-rules") Morph rules]
|
||||||
|
@ -104,6 +105,13 @@ p
|
||||||
+cell dict
|
+cell dict
|
||||||
+cell Attribute ID mapped to function.
|
+cell Attribute ID mapped to function.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code SYNTAX_ITERATORS]
|
||||||
|
+cell dict
|
||||||
|
+cell
|
||||||
|
| Iterator ID mapped to function. Currently only supports
|
||||||
|
| #[code 'noun_chunks'].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code LOOKUP]
|
+cell #[code LOOKUP]
|
||||||
+cell dict
|
+cell dict
|
||||||
|
@ -341,9 +349,12 @@ p
|
||||||
| a token's norm equals its lowercase text. If the lowercase spelling of a
|
| a token's norm equals its lowercase text. If the lowercase spelling of a
|
||||||
| word exists, norms should always be in lowercase.
|
| word exists, norms should always be in lowercase.
|
||||||
|
|
||||||
+aside-code("Accessing norms").
|
+aside-code("Norms vs. lemmas").
|
||||||
doc = nlp(u"I can't")
|
doc = nlp(u"I'm gonna realise")
|
||||||
assert [t.norm_ for t in doc] == ['i', 'can', 'not']
|
norms = [token.norm_ for token in doc]
|
||||||
|
lemmas = [token.lemma_ for token in doc]
|
||||||
|
assert norms == ['i', 'am', 'going', 'to', 'realize']
|
||||||
|
assert lemmas == ['i', 'be', 'go', 'to', 'realise']
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy usually tries to normalise words with different spellings to a single,
|
| spaCy usually tries to normalise words with different spellings to a single,
|
||||||
|
@ -449,6 +460,33 @@ p
|
||||||
| #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
|
| #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
|
||||||
| are overwritten.
|
| are overwritten.
|
||||||
|
|
||||||
|
+h(3, "syntax-iterators") Syntax iterators
|
||||||
|
|
||||||
|
p
|
||||||
|
| Syntax iterators are functions that compute views of a #[code Doc]
|
||||||
|
| object based on its syntax. At the moment, this data is only used for
|
||||||
|
| extracting
|
||||||
|
| #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
|
||||||
|
| are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
|
||||||
|
| property. Because base noun phrases work differently across languages,
|
||||||
|
| the rules to compute them are part of the individual language's data. If
|
||||||
|
| a language does not include a noun chunks iterator, the property won't
|
||||||
|
| be available. For examples, see the existing syntax iterators:
|
||||||
|
|
||||||
|
+aside-code("Noun chunks example").
|
||||||
|
doc = nlp(u'A phrase with another phrase occurs.')
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert chunks[0].text == "A phrase"
|
||||||
|
assert chunks[1].text == "another phrase"
|
||||||
|
|
||||||
|
+table(["Language", "Source"])
|
||||||
|
for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
|
||||||
|
+row
|
||||||
|
+cell=lang
|
||||||
|
+cell
|
||||||
|
+src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
|
||||||
|
| lang/#{lang_id}/syntax_iterators.py
|
||||||
|
|
||||||
+h(3, "lemmatizer") Lemmatizer
|
+h(3, "lemmatizer") Lemmatizer
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -604,6 +642,8 @@ p
|
||||||
|
|
||||||
+h(2, "vocabulary") Building the vocabulary
|
+h(2, "vocabulary") Building the vocabulary
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy expects that common words will be cached in a
|
| spaCy expects that common words will be cached in a
|
||||||
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
|
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
|
||||||
|
@ -697,6 +737,8 @@ p
|
||||||
|
|
||||||
+h(3, "word-vectors") Training the word vectors
|
+h(3, "word-vectors") Training the word vectors
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
|
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
|
||||||
| algorithms let you train useful word similarity models from unlabelled
|
| algorithms let you train useful word similarity models from unlabelled
|
||||||
|
@ -731,6 +773,8 @@ p
|
||||||
|
|
||||||
+h(2, "train-tagger-parser") Training the tagger and parser
|
+h(2, "train-tagger-parser") Training the tagger and parser
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can now train the model using a corpus for your language annotated
|
| You can now train the model using a corpus for your language annotated
|
||||||
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
|
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
|
||||||
|
|
|
@ -104,6 +104,13 @@ p
|
||||||
| recommend using pip with a direct link, instead of relying on spaCy's
|
| recommend using pip with a direct link, instead of relying on spaCy's
|
||||||
| #[+api("cli#download") #[code download]] command.
|
| #[+api("cli#download") #[code download]] command.
|
||||||
|
|
||||||
|
+infobox
|
||||||
|
| You can also add the direct download link to your application's
|
||||||
|
| #[code requirements.txt]. For more details,
|
||||||
|
| see the usage guide on
|
||||||
|
| #[+a("/docs/usage/production-use#models") working with models in production].
|
||||||
|
|
||||||
|
|
||||||
+h(3, "download-manual") Manual download and installation
|
+h(3, "download-manual") Manual download and installation
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -118,15 +125,15 @@ p
|
||||||
└── en_core_web_md-1.2.0.tar.gz # downloaded archive
|
└── en_core_web_md-1.2.0.tar.gz # downloaded archive
|
||||||
├── meta.json # model meta data
|
├── meta.json # model meta data
|
||||||
├── setup.py # setup file for pip installation
|
├── setup.py # setup file for pip installation
|
||||||
└── en_core_web_md # model directory
|
└── en_core_web_md # 📦 model package
|
||||||
├── __init__.py # init for pip installation
|
├── __init__.py # init for pip installation
|
||||||
├── meta.json # model meta data
|
├── meta.json # model meta data
|
||||||
└── en_core_web_md-1.2.0 # model data
|
└── en_core_web_md-1.2.0 # model data
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can place the model data directory anywhere on your local file system.
|
| You can place the #[strong model package directory] anywhere on your
|
||||||
| To use it with spaCy, simply assign it a name by creating a
|
| local file system. To use it with spaCy, simply assign it a name by
|
||||||
| #[+a("#usage") shortcut link] for the data directory.
|
| creating a #[+a("#usage") shortcut link] for the data directory.
|
||||||
|
|
||||||
+h(2, "usage") Using models with spaCy
|
+h(2, "usage") Using models with spaCy
|
||||||
|
|
||||||
|
@ -136,9 +143,9 @@ p
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import spacy
|
import spacy
|
||||||
nlp = spacy.load('en') # load model with shortcut link "en"
|
nlp = spacy.load('en') # load model with shortcut link "en"
|
||||||
nlp = spacy.load('en_core_web_sm') # load model package "en_core_web_sm"
|
nlp = spacy.load('en_core_web_sm') # load model package "en_core_web_sm"
|
||||||
nlp = spacy.load('/path/to/model') # load model from a directory
|
nlp = spacy.load('/path/to/en_core_web_sm') # load package from a directory
|
||||||
|
|
||||||
doc = nlp(u'This is a sentence.')
|
doc = nlp(u'This is a sentence.')
|
||||||
|
|
||||||
|
@ -219,6 +226,10 @@ p
|
||||||
| immediately, instead of failing somewhere down the line when calling
|
| immediately, instead of failing somewhere down the line when calling
|
||||||
| #[code spacy.load()].
|
| #[code spacy.load()].
|
||||||
|
|
||||||
|
+infobox
|
||||||
|
| For more details, see the usage guide on
|
||||||
|
| #[+a("/docs/usage/production-use#models") working with models in production].
|
||||||
|
|
||||||
+h(2, "own-models") Using your own models
|
+h(2, "own-models") Using your own models
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@ -76,3 +76,72 @@ p
|
||||||
| attributes to set the part-of-speech tags, syntactic dependencies, named
|
| attributes to set the part-of-speech tags, syntactic dependencies, named
|
||||||
| entities and other attributes. For details, see the respective usage
|
| entities and other attributes. For details, see the respective usage
|
||||||
| pages.
|
| pages.
|
||||||
|
|
||||||
|
+h(2, "models") Working with models
|
||||||
|
|
||||||
|
p
|
||||||
|
| If your application depends on one or more #[+a("/docs/usage/models") models],
|
||||||
|
| you'll usually want to integrate them into your continuous integration
|
||||||
|
| workflow and build process. While spaCy provides a range of useful helpers
|
||||||
|
| for downloading, linking and loading models, the underlying functionality
|
||||||
|
| is entirely based on native Python packages. This allows your application
|
||||||
|
| to handle a model like any other package dependency.
|
||||||
|
|
||||||
|
+h(3, "models-download") Downloading and requiring model dependencies
|
||||||
|
|
||||||
|
p
|
||||||
|
| spaCy's built-in #[+api("cli#download") #[code download]] command
|
||||||
|
| is mostly intended as a convenient, interactive wrapper. It performs
|
||||||
|
| compatibility checks and prints detailed error messages and warnings.
|
||||||
|
| However, if you're downloading models as part of an automated build
|
||||||
|
| process, this only adds an unnecessary layer of complexity. If you know
|
||||||
|
| which models your application needs, you should be specifying them directly.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Because all models are valid Python packages, you can add them to your
|
||||||
|
| application's #[code requirements.txt]. If you're running your own
|
||||||
|
| internal PyPI installation, you can simply upload the models there. pip's
|
||||||
|
| #[+a("https://pip.pypa.io/en/latest/reference/pip_install/#requirements-file-format") requirements file format]
|
||||||
|
| supports both package names to download via a PyPI server, as well as direct
|
||||||
|
| URLs.
|
||||||
|
|
||||||
|
+code("requirements.txt", "text").
|
||||||
|
spacy>=2.0.0,<3.0.0
|
||||||
|
-e #{gh("spacy-models")}/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
|
||||||
|
|
||||||
|
p
|
||||||
|
| All models are versioned and specify their spaCy dependency. This ensures
|
||||||
|
| cross-compatibility and lets you specify exact version requirements for
|
||||||
|
| each model. If you've trained your own model, you can use the
|
||||||
|
| #[+api("cli#package") #[code package]] command to generate the required
|
||||||
|
| meta data and turn it into a loadable package.
|
||||||
|
|
||||||
|
+h(3, "models-loading") Loading and testing models
|
||||||
|
|
||||||
|
p
|
||||||
|
| Downloading models directly via pip won't call spaCy's
|
||||||
|
| #[+api("cli#link") #[code link]] command, which creates
|
||||||
|
| symlinks for model shortcuts. This means that you'll have to run this
|
||||||
|
| command separately, or use the native #[code import] syntax to load the
|
||||||
|
| models:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
import en_core_web_sm
|
||||||
|
nlp = en_core_web_sm.load()
|
||||||
|
|
||||||
|
p
|
||||||
|
| In general, this approach is recommended for larger code bases, as it's
|
||||||
|
| more "native", and doesn't depend on symlinks or rely on spaCy's loader
|
||||||
|
| to resolve string names to model packages. If a model can't be
|
||||||
|
| imported, Python will raise an #[code ImportError] immediately. And if a
|
||||||
|
| model is imported but not used, any linter will catch that.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Similarly, it'll give you more flexibility when writing tests that
|
||||||
|
| require loading models. For example, instead of writing your own
|
||||||
|
| #[code try] and #[code except] logic around spaCy's loader, you can use
|
||||||
|
| #[+a("http://pytest.readthedocs.io/en/latest/") pytest]'s
|
||||||
|
| #[code importorskip()] method to only run a test if a specific model or
|
||||||
|
| model version is installed. Each model package exposes a #[code __version__]
|
||||||
|
| attribute which you can also use to perform your own version compatibility
|
||||||
|
| checks before loading a model.
|
||||||
|
|
|
@ -29,6 +29,7 @@ p
|
||||||
| standards.
|
| standards.
|
||||||
|
|
||||||
+h(2, "getting-started") Getting started
|
+h(2, "getting-started") Getting started
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
p
|
p
|
||||||
| The quickest way to visualize #[code Doc] is to use
|
| The quickest way to visualize #[code Doc] is to use
|
||||||
|
|
Loading…
Reference in New Issue
Block a user