mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
cleanup
This commit is contained in:
parent
86a4e316b8
commit
ed53bb979d
|
@ -9,7 +9,7 @@ from ...typedefs cimport hash_t, attr_t
|
||||||
from ...strings cimport hash_string
|
from ...strings cimport hash_string
|
||||||
from ...structs cimport TokenC
|
from ...structs cimport TokenC
|
||||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||||
from ...tokens.token import MISSING_DEP_
|
from ...tokens.token cimport MISSING_DEP
|
||||||
from ...training.example cimport Example
|
from ...training.example cimport Example
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC, ArcC
|
from ._state cimport StateC, ArcC
|
||||||
|
@ -196,8 +196,7 @@ cdef class ArcEagerGold:
|
||||||
def __init__(self, ArcEager moves, StateClass stcls, Example example):
|
def __init__(self, ArcEager moves, StateClass stcls, Example example):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
heads, labels = example.get_aligned_parse(projectivize=True)
|
heads, labels = example.get_aligned_parse(projectivize=True)
|
||||||
labels = [label if label is not None else MISSING_DEP_ for label in labels]
|
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
|
||||||
labels = [example.x.vocab.strings.add(label) for label in labels]
|
|
||||||
sent_starts = example.get_aligned_sent_starts()
|
sent_starts = example.get_aligned_sent_starts()
|
||||||
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
|
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
|
||||||
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
|
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
|
||||||
|
|
|
@ -4,7 +4,6 @@ from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STO
|
||||||
from spacy.symbols import VERB
|
from spacy.symbols import VERB
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.tokens.token import MISSING_DEP_
|
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
|
|
||||||
|
|
||||||
|
@ -255,24 +254,35 @@ def test_token_api_non_conjuncts(en_vocab):
|
||||||
|
|
||||||
|
|
||||||
def test_missing_head_dep(en_vocab):
|
def test_missing_head_dep(en_vocab):
|
||||||
heads = [1, 1, 1, 1, 2, None]
|
""" Check that the Doc constructor and Example.from_dict parse missing information the same"""
|
||||||
deps = ["nsubj", "ROOT", "dobj", "cc", "conj", None]
|
heads = [1, 1, 1, 1, 2, None] # element 5 is missing
|
||||||
|
deps = ["", "ROOT", "dobj", "cc", "conj", None] # element 0 and 5 are missing
|
||||||
words = ["I", "like", "London", "and", "Berlin", "."]
|
words = ["I", "like", "London", "and", "Berlin", "."]
|
||||||
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||||
pred_has_heads = [t.has_head() for t in doc]
|
pred_has_heads = [t.has_head() for t in doc]
|
||||||
|
pred_has_deps = [t.has_dep() for t in doc]
|
||||||
|
pred_heads = [t.head.i for t in doc]
|
||||||
pred_deps = [t.dep_ for t in doc]
|
pred_deps = [t.dep_ for t in doc]
|
||||||
pred_sent_starts = [t.is_sent_start for t in doc]
|
pred_sent_starts = [t.is_sent_start for t in doc]
|
||||||
assert pred_has_heads == [True, True, True, True, True, False]
|
assert pred_has_heads == [False, True, True, True, True, False]
|
||||||
assert pred_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
|
assert pred_has_deps == [False, True, True, True, True, False]
|
||||||
|
assert pred_heads[1:5] == [1, 1, 1, 2]
|
||||||
|
assert pred_deps[1:5] == ["ROOT", "dobj", "cc", "conj"]
|
||||||
assert pred_sent_starts == [True, False, False, False, False, False]
|
assert pred_sent_starts == [True, False, False, False, False, False]
|
||||||
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
|
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
|
||||||
|
ref_has_heads = [t.has_head() for t in example.reference]
|
||||||
|
ref_has_deps = [t.has_dep() for t in example.reference]
|
||||||
ref_heads = [t.head.i for t in example.reference]
|
ref_heads = [t.head.i for t in example.reference]
|
||||||
ref_deps = [t.dep_ for t in example.reference]
|
ref_deps = [t.dep_ for t in example.reference]
|
||||||
ref_has_heads = [t.has_head() for t in example.reference]
|
|
||||||
ref_sent_starts = [t.is_sent_start for t in example.reference]
|
ref_sent_starts = [t.is_sent_start for t in example.reference]
|
||||||
assert ref_deps == ["nsubj", "ROOT", "dobj", "cc", "conj", MISSING_DEP_]
|
assert ref_has_heads == pred_has_heads
|
||||||
assert ref_has_heads == [True, True, True, True, True, False]
|
assert ref_has_deps == pred_has_heads
|
||||||
assert ref_sent_starts == [True, False, False, False, False, False]
|
assert ref_heads == pred_heads
|
||||||
|
assert ref_deps == pred_deps
|
||||||
|
assert ref_sent_starts == pred_sent_starts
|
||||||
|
# check that the aligned parse preserves the missing information
|
||||||
aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
|
aligned_heads, aligned_deps = example.get_aligned_parse(projectivize=True)
|
||||||
|
assert aligned_deps[0] == ref_deps[0]
|
||||||
|
assert aligned_heads[0] == ref_heads[0]
|
||||||
|
assert aligned_deps[5] == ref_deps[5]
|
||||||
assert aligned_heads[5] == ref_heads[5]
|
assert aligned_heads[5] == ref_heads[5]
|
||||||
assert aligned_deps[5] == MISSING_DEP_
|
|
||||||
|
|
|
@ -253,7 +253,7 @@ def test_overfitting_IO(pipe_name):
|
||||||
parser.add_label(dep)
|
parser.add_label(dep)
|
||||||
optimizer = nlp.initialize()
|
optimizer = nlp.initialize()
|
||||||
# run overfitting
|
# run overfitting
|
||||||
for i in range(150):
|
for i in range(200):
|
||||||
losses = {}
|
losses = {}
|
||||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||||
assert losses[pipe_name] < 0.0001
|
assert losses[pipe_name] < 0.0001
|
||||||
|
|
|
@ -16,7 +16,7 @@ from thinc.util import copy_array
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
from .token import MISSING_DEP_
|
from .token cimport MISSING_DEP
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
@ -269,6 +269,7 @@ cdef class Doc:
|
||||||
if heads is not None:
|
if heads is not None:
|
||||||
heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
|
heads = [head - i if head is not None else 0 for i, head in enumerate(heads)]
|
||||||
if deps is not None:
|
if deps is not None:
|
||||||
|
MISSING_DEP_ = self.vocab.strings[MISSING_DEP]
|
||||||
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
|
deps = [dep if dep is not None else MISSING_DEP_ for dep in deps]
|
||||||
if deps and not heads:
|
if deps and not heads:
|
||||||
heads = [0] * len(deps)
|
heads = [0] * len(deps)
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
cdef int MISSING_DEP = 0
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
@ -95,9 +96,12 @@ cdef class Token:
|
||||||
elif feat_name == SENT_START:
|
elif feat_name == SENT_START:
|
||||||
token.sent_start = value
|
token.sent_start = value
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef inline int missing_dep(const TokenC* token) nogil:
|
||||||
|
return token.dep == MISSING_DEP
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline int missing_head(const TokenC* token) nogil:
|
cdef inline int missing_head(const TokenC* token) nogil:
|
||||||
if token.dep == 0:
|
return Token.missing_dep(token)
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
return 0
|
|
||||||
|
|
|
@ -22,8 +22,6 @@ from .. import parts_of_speech
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from .underscore import Underscore, get_ext_args
|
from .underscore import Underscore, get_ext_args
|
||||||
|
|
||||||
MISSING_DEP_ = ""
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token – i.e. a word, punctuation symbol, whitespace,
|
"""An individual token – i.e. a word, punctuation symbol, whitespace,
|
||||||
|
@ -640,14 +638,13 @@ cdef class Token:
|
||||||
return False
|
return False
|
||||||
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
|
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
|
||||||
|
|
||||||
|
|
||||||
def has_head(self):
|
def has_head(self):
|
||||||
"""Check whether the token has annotated head information.
|
"""Check whether the token has annotated head information.
|
||||||
|
Return False when the head annotation is unset/missing.
|
||||||
|
|
||||||
RETURNS (bool): Whether the head annotation is valid or not.
|
RETURNS (bool): Whether the head annotation is valid or not.
|
||||||
"""
|
"""
|
||||||
return self.dep_ != MISSING_DEP_
|
return not Token.missing_head(self.c)
|
||||||
|
|
||||||
|
|
||||||
property head:
|
property head:
|
||||||
"""The syntactic parent, or "governor", of this token.
|
"""The syntactic parent, or "governor", of this token.
|
||||||
|
@ -873,6 +870,14 @@ cdef class Token:
|
||||||
def __set__(self, tag):
|
def __set__(self, tag):
|
||||||
self.tag = self.vocab.strings.add(tag)
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
|
def has_dep(self):
|
||||||
|
"""Check whether the token has annotated dep information.
|
||||||
|
Returns False when the dep label is unset/missing.
|
||||||
|
|
||||||
|
RETURNS (bool): Whether the dep label is valid or not.
|
||||||
|
"""
|
||||||
|
return not Token.missing_dep(self.c)
|
||||||
|
|
||||||
property dep_:
|
property dep_:
|
||||||
"""RETURNS (str): The syntactic dependency label."""
|
"""RETURNS (str): The syntactic dependency label."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -12,7 +12,7 @@ from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
|
||||||
from .iob_utils import biluo_tags_to_spans
|
from .iob_utils import biluo_tags_to_spans
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..tokens.token import MISSING_DEP_
|
from ..tokens.token cimport MISSING_DEP
|
||||||
from ..util import logger
|
from ..util import logger
|
||||||
|
|
||||||
|
|
||||||
|
@ -180,14 +180,15 @@ cdef class Example:
|
||||||
gold_to_cand = self.alignment.y2x
|
gold_to_cand = self.alignment.y2x
|
||||||
aligned_heads = [None] * self.x.length
|
aligned_heads = [None] * self.x.length
|
||||||
aligned_deps = [None] * self.x.length
|
aligned_deps = [None] * self.x.length
|
||||||
|
has_deps = [token.has_dep() for token in self.y]
|
||||||
has_heads = [token.has_head() for token in self.y]
|
has_heads = [token.has_head() for token in self.y]
|
||||||
heads = [token.head.i for token in self.y]
|
heads = [token.head.i for token in self.y]
|
||||||
deps = [token.dep_ for token in self.y]
|
deps = [token.dep_ for token in self.y]
|
||||||
if projectivize:
|
if projectivize:
|
||||||
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
|
proj_heads, proj_deps = nonproj.projectivize(heads, deps)
|
||||||
# ensure that data that was previously missing, remains missing
|
# ensure that missing data remains missing
|
||||||
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
|
heads = [h if has_heads[i] else heads[i] for i, h in enumerate(proj_heads)]
|
||||||
deps = [d if deps[i] != MISSING_DEP_ else MISSING_DEP_ for i, d in enumerate(proj_deps)]
|
deps = [d if has_deps[i] else deps[i] for i, d in enumerate(proj_deps)]
|
||||||
for cand_i in range(self.x.length):
|
for cand_i in range(self.x.length):
|
||||||
if cand_to_gold.lengths[cand_i] == 1:
|
if cand_to_gold.lengths[cand_i] == 1:
|
||||||
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
|
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
|
||||||
|
@ -337,8 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
|
values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
|
||||||
elif key == "DEP":
|
elif key == "DEP":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
value = [v if v is not None else MISSING_DEP_ for v in value]
|
values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
|
||||||
values.append([vocab.strings.add(h) for h in value])
|
|
||||||
elif key == "SENT_START":
|
elif key == "SENT_START":
|
||||||
attrs.append(key)
|
attrs.append(key)
|
||||||
values.append(value)
|
values.append(value)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user