Tidy up and fix issues

This commit is contained in:
Ines Montani 2020-02-18 15:17:03 +01:00
parent de11ea753a
commit 1278161f47
9 changed files with 1476 additions and 1500 deletions

View File

@ -235,7 +235,7 @@ def example_from_conllu_sentence(
subtok_word = "" subtok_word = ""
in_subtok = False in_subtok = False
id_ = int(id_) - 1 id_ = int(id_) - 1
head = (int(head) - 1) if head != "0" else id_ head = (int(head) - 1) if head not in ("0", "_") else id_
tag = pos if tag == "_" else tag tag = pos if tag == "_" else tag
morph = morph if morph != "_" else "" morph = morph if morph != "_" else ""
dep = "ROOT" if dep == "root" else dep dep = "ROOT" if dep == "root" else dep

View File

@ -541,8 +541,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. " E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes " "This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.") "'{token_attrs}'.")
E998 = ("Can only create GoldParse's from Example's without a Doc, " E998 = ("Can only create GoldParse objects from Example objects without a "
"if get_gold_parses() is called with a Vocab object.") "Doc if get_gold_parses() is called with a Vocab object.")
E999 = ("Encountered an unexpected format for the dictionary holding " E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}") "gold annotations: {gold_dict}")

View File

@ -991,11 +991,6 @@ cdef class GoldParse:
self.cats = {} if cats is None else dict(cats) self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links) self.links = {} if links is None else dict(links)
# orig_annot is used as an iterator in `nlp.evalate` even if self.length == 0,
# so set a empty list to avoid error.
# if self.lenght > 0, this is modified latter.
self.orig_annot = []
# avoid allocating memory if the doc does not contain any tokens # avoid allocating memory if the doc does not contain any tokens
if self.length > 0: if self.length > 0:
if not words: if not words:

View File

@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [

View File

@ -1,8 +1,5 @@
# coding: utf8 from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB
from __future__ import unicode_literals from ...symbols import NOUN, PART, INTJ, PRON
from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
# Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html # Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
# fmt: off # fmt: off

View File

@ -77,7 +77,7 @@ cdef class Parser:
tok2vec = Tok2Vec(width=token_vector_width, tok2vec = Tok2Vec(width=token_vector_width,
embed_size=embed_size, embed_size=embed_size,
conv_depth=conv_depth, conv_depth=conv_depth,
window_size=window_size, window_size=conv_window,
cnn_maxout_pieces=t2v_pieces, cnn_maxout_pieces=t2v_pieces,
subword_features=subword_features, subword_features=subword_features,
pretrained_vectors=pretrained_vectors, pretrained_vectors=pretrained_vectors,
@ -105,7 +105,7 @@ cdef class Parser:
'bilstm_depth': bilstm_depth, 'bilstm_depth': bilstm_depth,
'self_attn_depth': self_attn_depth, 'self_attn_depth': self_attn_depth,
'conv_depth': conv_depth, 'conv_depth': conv_depth,
'window_size': window_size, 'window_size': conv_window,
'embed_size': embed_size, 'embed_size': embed_size,
'cnn_maxout_pieces': t2v_pieces 'cnn_maxout_pieces': t2v_pieces
} }

View File

@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import EntityRuler from spacy.pipeline import EntityRuler
@ -9,11 +6,12 @@ def test_issue4849():
nlp = English() nlp = English()
ruler = EntityRuler( ruler = EntityRuler(
nlp, patterns=[ nlp,
{"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'}, patterns=[
{"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'}, {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
], ],
phrase_matcher_attr="LOWER" phrase_matcher_attr="LOWER",
) )
nlp.add_pipe(ruler) nlp.add_pipe(ruler)
@ -27,10 +25,10 @@ def test_issue4849():
count_ents = 0 count_ents = 0
for doc in nlp.pipe([text], n_process=1): for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert(count_ents == 2) assert count_ents == 2
# USING 2 PROCESSES # USING 2 PROCESSES
count_ents = 0 count_ents = 0
for doc in nlp.pipe([text], n_process=2): for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert (count_ents == 2) assert count_ents == 2

View File

@ -1,16 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
import pytest import pytest
from spacy.language import Language
import spacy
@pytest.fixture def test_evaluate():
def nlp(): nlp = Language()
return spacy.blank("en")
def test_evaluate(nlp):
docs_golds = [("", {})] docs_golds = [("", {})]
with pytest.raises(ValueError):
nlp.evaluate(docs_golds) nlp.evaluate(docs_golds)