mirror of https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00

commit ed53bb979d
parent 86a4e316b8

cleanup
spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...tokens.token import MISSING_DEP_
+from ...tokens.token cimport MISSING_DEP
 from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC, ArcC
@@ -196,8 +196,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [label if label is not None else MISSING_DEP_ for label in labels]
-        labels = [example.x.vocab.strings.add(label) for label in labels]
+        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
         sent_starts = example.get_aligned_sent_starts()
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
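
For orientation, a minimal sketch (not part of the diff) of the encoding idea the hunk above switches to: gold labels are interned to 64-bit hash IDs in one pass, and a missing label maps straight to the C-level MISSING_DEP sentinel (0) instead of first being replaced by the empty string. MISSING_DEP is restated locally here; in this branch it lives in spacy/tokens/token.pxd.

    from spacy.vocab import Vocab

    MISSING_DEP = 0  # sentinel from token.pxd, restated for illustration

    vocab = Vocab()
    labels = ["nsubj", "ROOT", None]  # None marks a missing gold label
    encoded = [vocab.strings.add(label) if label is not None else MISSING_DEP
               for label in labels]
    print(encoded)  # two hash IDs followed by 0 for the missing label
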
spacy/tests/doc/test_token_api.py
@@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
 from spacy.symbols import VERB
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-from spacy.tokens.token import MISSING_DEP_
 from spacy.training import Example


@@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):


 def test_missing_head_dep(en_vocab):
-    heads = [1, 1, 1, 1, 2, None]
-    deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
+    """ Check that the Doc constructor and Example.from_dict parse missing information the same"""
+    heads = [1, 1, 1, 1, 2, None]  # element 5 is missing
+    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # element 0 and 5 are missing
     words = ["I", "like", "London", "and", "Berlin", "."]
     doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
     pred_has_heads = [t.has_head() for t in doc]
+    pred_has_deps = [t.has_dep() for t in doc]
     pred_heads = [t.head.i for t in doc]
     pred_deps = [t.dep_ for t in doc]
     pred_sent_starts = [t.is_sent_start for t in doc]
-    assert pred_has_heads == [True, True, True, True, True, False]
-    assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
+    assert pred_has_heads == [False, True, True, True, True, False]
+    assert pred_has_deps == [False, True, True, True, True, False]
+    assert pred_heads[1:5] == [1, 1, 1, 2]
+    assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
     assert pred_sent_starts == [True, False, False, False, False, False]
     example = Example.from_dict(doc, {"heads": heads, "deps": deps})
+    ref_has_heads = [t.has_head() for t in example.reference]
+    ref_has_deps = [t.has_dep() for t in example.reference]
     ref_heads = [t.head.i for t in example.reference]
     ref_deps = [t.dep_ for t in example.reference]
-    ref_has_heads = [t.has_head() for t in example.reference]
     ref_sent_starts = [t.is_sent_start for t in example.reference]
-    assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
-    assert ref_has_heads == [True, True, True, True, True, False]
-    assert ref_sent_starts == [True, False, False, False, False, False]
+    assert ref_has_heads == pred_has_heads
+    assert ref_has_deps == pred_has_heads
+    assert ref_heads == pred_heads
+    assert ref_deps == pred_deps
+    assert ref_sent_starts == pred_sent_starts
+    # check that the aligned parse preserves the missing information
     aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
-    assert aligned_deps[5] == MISSING_DEP_
+    assert aligned_deps[0] == ref_deps[0]
+    assert aligned_heads[0] == ref_heads[0]
+    assert aligned_deps[5] == ref_deps[5]
+    assert aligned_heads[5] == ref_heads[5]
spacy/tests/parser/test_parse.py
@@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
         parser.add_label(dep)
     optimizer = nlp.initialize()
     # run overfitting
-    for i in range(150):
+    for i in range(200):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
     assert losses[pipe_name] < 0.0001
spacy/tokens/doc.pyx
@@ -16,7 +16,7 @@ from thinc.util import copy_array
 import warnings

 from .span cimport Span
-from .token import MISSING_DEP_
+from .token cimport MISSING_DEP
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@@ -269,6 +269,7 @@ cdef class Doc:
         if heads is not None:
             heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
         if deps is not None:
+            MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
             deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
         if deps and not heads:
             heads = [0] * len(deps)
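
As a usage sketch of what the hunk above enables (mirroring the new test earlier in this diff; assumes spaCy v3 with this branch): `None` entries in `deps` are mapped to the MISSING_DEP sentinel at construction time, and the new predicates report those tokens as missing.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]                    # head of token 5 is unknown
    deps = ["", "ROOT", "dobj", "cc", "conj", None]  # deps of tokens 0 and 5 are unknown

    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    print([t.has_dep() for t in doc])   # [False, True, True, True, True, False]
    print([t.has_head() for t in doc])  # [False, True, True, True, True, False]
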
spacy/tokens/token.pxd
@@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme

 from ..errors import Errors

+cdef int MISSING_DEP = 0

 cdef class Token:
     cdef readonly Vocab vocab
@@ -95,9 +96,12 @@ cdef class Token:
         elif feat_name == SENT_START:
             token.sent_start = value

+    @staticmethod
+    cdef inline int missing_dep(const TokenC* token) nogil:
+        return token.dep == MISSING_DEP
+
     @staticmethod
     cdef inline int missing_head(const TokenC* token) nogil:
-        if token.dep == 0:
-            return 1
-        else:
-            return 0
+        return Token.missing_dep(token)
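
A pure-Python model of the two helpers above, for illustration only (the real implementations are the nogil Cython functions in the hunk): both predicates reduce to one integer comparison against the sentinel, since a head is only considered annotated when a dependency label is.

    MISSING_DEP = 0  # matches the cdef int sentinel above

    def missing_dep(dep_hash: int) -> bool:
        # the dep is missing iff it still holds the sentinel value
        return dep_hash == MISSING_DEP

    def missing_head(dep_hash: int) -> bool:
        # head annotation is treated as missing exactly when the dep is
        return missing_dep(dep_hash)

    assert missing_head(MISSING_DEP)
    assert not missing_dep(12345)
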
spacy/tokens/token.pyx
@@ -22,8 +22,6 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from .underscore import Underscore, get_ext_args

-MISSING_DEP_ = ""
-

 cdef class Token:
     """An individual token – i.e. a word, punctuation symbol, whitespace,
@@ -640,14 +638,13 @@ cdef class Token:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

     def has_head(self):
         """Check whether the token has annotated head information.
+        Return False when the head annotation is unset/missing.

         RETURNS (bool): Whether the head annotation is valid or not.
         """
-        return self.dep_ != MISSING_DEP_
-
+        return not Token.missing_head(self.c)

     property head:
         """The syntactic parent, or "governor", of this token.
@@ -873,6 +870,14 @@ cdef class Token:
         def __set__(self, tag):
             self.tag = self.vocab.strings.add(tag)

+    def has_dep(self):
+        """Check whether the token has annotated dep information.
+        Returns False when the dep label is unset/missing.
+
+        RETURNS (bool): Whether the dep label is valid or not.
+        """
+        return not Token.missing_dep(self.c)
+
     property dep_:
         """RETURNS (str): The syntactic dependency label."""
         def __get__(self):
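
Behaviorally, `has_dep()` matches the old `self.dep_ != MISSING_DEP_` string comparison, because the sentinel hash 0 interns to the empty string; the cleanup just turns it into an integer check in C. A small sketch (assumes spaCy v3 with this branch):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    doc = Doc(Vocab(), words=["hello", "world"], heads=[1, None], deps=["intj", None])
    token = doc[1]
    print(token.dep_)       # '' – the sentinel read back through the StringStore
    print(token.has_dep())  # False
    print(token.has_head()) # False: head and dep are missing together
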
spacy/training/example.pyx
@@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
 from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
-from ..tokens.token import MISSING_DEP_
+from ..tokens.token cimport MISSING_DEP
 from ..util import logger


@@ -180,14 +180,15 @@ cdef class Example:
         gold_to_cand = self.alignment.y2x
         aligned_heads = [None] * self.x.length
         aligned_deps = [None] * self.x.length
+        has_deps = [token.has_dep() for token in self.y]
         has_heads = [token.has_head() for token in self.y]
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
         if projectivize:
             proj_heads, proj_deps = nonproj.projectivize(heads, deps)
-            # ensure that data that was previously missing, remains missing
+            # ensure that missing data remains missing
             heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
-            deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
+            deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
         for cand_i in range(self.x.length):
             if cand_to_gold.lengths[cand_i] == 1:
                 gold_i = cand_to_gold[cand_i].dataXd[0, 0]
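
A sketch of the round trip the has_deps/has_heads guards protect (assumes spaCy v3 with this branch): projectivization may rewrite heads and labels, but entries that were missing in the reference stay missing in the aligned parse.

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.training import Example

    words = ["I", "like", "London", "and", "Berlin", "."]
    heads = [1, 1, 1, 1, 2, None]
    deps = ["", "ROOT", "dobj", "cc", "conj", None]
    doc = Doc(Vocab(), words=words, heads=heads, deps=deps)
    example = Example.from_dict(doc, {"heads": heads, "deps": deps})

    aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
    # tokens 0 and 5 were missing in the reference and are still missing here
    assert aligned_deps[0] == example.reference[0].dep_
    assert aligned_deps[5] == example.reference[5].dep_
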
@@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
         elif key == "DEP":
             attrs.append(key)
-            value = [v if v is not None else MISSING_DEP_ for v in value]
-            values.append([vocab.strings.add(h) for h in value])
+            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
             values.append(value)