Tidy up and fix issues

This commit is contained in:
Ines Montani 2020-02-18 15:17:03 +01:00
parent de11ea753a
commit 1278161f47
9 changed files with 1476 additions and 1500 deletions

View File

@ -235,7 +235,7 @@ def example_from_conllu_sentence(
subtok_word = ""
in_subtok = False
id_ = int(id_) - 1
head = (int(head) - 1) if head != "0" else id_
head = (int(head) - 1) if head not in ("0", "_") else id_
tag = pos if tag == "_" else tag
morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep

View File

@ -541,8 +541,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E998 = ("Can only create GoldParse's from Example's without a Doc, "
"if get_gold_parses() is called with a Vocab object.")
E998 = ("Can only create GoldParse objects from Example objects without a "
"Doc if get_gold_parses() is called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")

View File

@ -991,11 +991,6 @@ cdef class GoldParse:
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# orig_annot is used as an iterator in `nlp.evaluate` even if self.length == 0,
# so set an empty list to avoid an error.
# If self.length > 0, this is modified later.
self.orig_annot = []
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if not words:

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = [

File diff suppressed because it is too large Load Diff

View File

@ -77,7 +77,7 @@ cdef class Parser:
tok2vec = Tok2Vec(width=token_vector_width,
embed_size=embed_size,
conv_depth=conv_depth,
window_size=window_size,
window_size=conv_window,
cnn_maxout_pieces=t2v_pieces,
subword_features=subword_features,
pretrained_vectors=pretrained_vectors,
@ -105,7 +105,7 @@ cdef class Parser:
'bilstm_depth': bilstm_depth,
'self_attn_depth': self_attn_depth,
'conv_depth': conv_depth,
'window_size': window_size,
'window_size': conv_window,
'embed_size': embed_size,
'cnn_maxout_pieces': t2v_pieces
}

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
@ -9,11 +6,12 @@ def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp, patterns=[
{"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
{"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER"
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
@ -27,10 +25,10 @@ def test_issue4849():
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert(count_ents == 2)
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert (count_ents == 2)
assert count_ents == 2

View File

@ -1,16 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
import spacy
from spacy.language import Language
@pytest.fixture
def nlp():
return spacy.blank("en")
def test_evaluate(nlp):
def test_evaluate():
nlp = Language()
docs_golds = [("", {})]
nlp.evaluate(docs_golds)
with pytest.raises(ValueError):
nlp.evaluate(docs_golds)