Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Ines Montani 2020-09-17 12:34:15 +02:00
commit c8fa2247e3
69 changed files with 563 additions and 592 deletions

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a18"
+__version__ = "3.0.0a19"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
    RETURNS (dict): Generated dependency parse keyed by words and arcs.
    """
    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        warnings.warn(Warnings.W005)
    if options.get("collapse_phrases", False):
        with doc.retokenize() as retokenizer:

View File

@@ -119,6 +119,11 @@ class Warnings:
    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
            "need to match on a stream of documents, you can use nlp.pipe and "
            "call the {matcher} on each Doc object.")
+    W106 = ("Both HEAD and SENT_START are included as attributes in "
+            "doc.from_array(). The parse trees based on the HEAD attribute "
+            "will override the values in SENT_START.")
+    W107 = ("The property Doc.{prop} is deprecated. Use "
+            "Doc.has_annotation(\"{attr}\") instead.")

@add_codes
@@ -192,11 +197,6 @@ class Errors:
            "Alternatively, add the dependency parser, or set sentence "
            "boundaries by setting doc[i].is_sent_start.")
    E031 = ("Invalid token: empty string ('') at position {i}.")
-    E032 = ("Conflicting attributes specified in doc.from_array(): "
-            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
-            "boundaries implicitly, based on the tree structure. This means "
-            "the HEAD attribute would potentially override the sentence "
-            "boundaries set by SENT_START.")
    E033 = ("Cannot load into non-empty Doc of length {length}.")
    E035 = ("Error creating span with start {start} and end {end} for Doc of "
            "length {length}.")
@@ -397,8 +397,8 @@ class Errors:
    E154 = ("One of the attributes or values is not supported for token "
            "patterns. Please use the option validate=True with Matcher, "
            "PhraseMatcher, or EntityRuler for more details.")
-    E155 = ("The pipeline needs to include a tagger in order to use "
-            "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
+    E155 = ("The pipeline needs to include a {pipe} in order to use "
+            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
            "instead of list(nlp.tokenizer.pipe()).")
    E156 = ("The pipeline needs to include a parser in order to use "

View File

@@ -1,7 +1,11 @@
+from typing import Optional
+from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer

class BengaliDefaults(Language.Defaults):
@@ -17,4 +21,22 @@ class Bengali(Language):
    Defaults = BengaliDefaults

+@Bengali.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)

__all__ = ["Bengali"]

View File

@@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_label = doc.vocab.strings.add("NP")
    np_deps = set(doc.vocab.strings.add(label) for label in labels)

View File

@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    # Further improvement of the models will eliminate the need for this tag.
    labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    doc = doclike.doc
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    if not len(doc):
        return

View File

@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer

class PersianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Persian(Language):
    Defaults = PersianDefaults

+@Persian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)

__all__ = ["Persian"]

View File

@@ -19,7 +19,7 @@ def noun_chunks(doclike):
    ]
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings.add(label) for label in labels]

View File

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -1,9 +1,13 @@
+from typing import Optional
+from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer

class NorwegianDefaults(Language.Defaults):
@@ -20,4 +24,22 @@ class Norwegian(Language):
    Defaults = NorwegianDefaults

+@Norwegian.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)

__all__ = ["Norwegian"]

View File

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -1,8 +1,13 @@
+from typing import Optional
+from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
+from ...lookups import Lookups
+from ...pipeline import Lemmatizer

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
@@ -22,4 +27,22 @@ class Swedish(Language):
    Defaults = SwedishDefaults

+@Swedish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "lookups": None},
+    scores=["lemma_acc"],
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    lookups: Optional[Lookups],
+):
+    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)

__all__ = ["Swedish"]

View File

@@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    conj = doc.vocab.strings.add("conj")

View File

@@ -8,7 +8,7 @@ from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
-from thinc.api import get_current_ops, Config, require_gpu, Optimizer
+from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
import srsly
import multiprocessing as mp
from itertools import chain, cycle
@@ -1448,10 +1448,15 @@ class Language:
        """Register 'listeners' within pipeline components, to allow them to
        effectively share weights.
        """
+        # I had thought, "Why do we do this inside the Language object? Shouldn't
+        # it be the tok2vec/transformer/etc's job?"
+        # The problem is we need to do it during deserialization...And the
+        # components don't receive the pipeline then. So this does have to be
+        # here :(
        for i, (name1, proc1) in enumerate(self.pipeline):
            if hasattr(proc1, "find_listeners"):
-                for name2, proc2 in self.pipeline[i:]:
-                    if hasattr(proc2, "model"):
+                for name2, proc2 in self.pipeline[i+1:]:
+                    if isinstance(getattr(proc2, "model", None), Model):
                        proc1.find_listeners(proc2.model)

    @classmethod

View File

@@ -17,7 +17,7 @@ from ..vocab cimport Vocab
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH

from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@@ -215,10 +215,15 @@ cdef class Matcher:
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
        cdef Pool tmp_pool = Pool()
-        if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
-                and not doc.is_tagged:
-            raise ValueError(Errors.E155.format())
-        if DEP in self._seen_attrs and not doc.is_parsed:
+        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
+            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+        if POS in self._seen_attrs and not doc.has_annotation("POS"):
+            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
+            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
+            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
            raise ValueError(Errors.E156.format())
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                               extensions=self._extensions, predicates=self._extra_predicates)
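
In practice, the finer-grained checks mean a pattern that relies on POS now reports exactly which pipe is missing instead of the old blanket "needs a tagger" message. A small sketch of the behaviour; the POS values are set by hand here purely to stand in for a morphologizer:

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()  # tokenizer only
matcher = Matcher(nlp.vocab)
matcher.add("NOUNS", [[{"POS": "NOUN"}]])

doc = nlp.make_doc("cats chase mice")
try:
    matcher(doc)
except ValueError as err:
    print(err)  # E155, naming the morphologizer and the POS attribute

# A real pipeline would annotate POS via nlp(); setting it manually also
# satisfies the check.
for token in doc:
    token.pos_ = "NOUN"
print(matcher(doc))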

View File

@@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
import warnings

-from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
+from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..structs cimport TokenC
from ..tokens.token cimport Token
from ..tokens.span cimport Span
@@ -184,12 +184,20 @@ cdef class PhraseMatcher:
            if len(doc) == 0:
                continue
            if isinstance(doc, Doc):
-                if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
-                    raise ValueError(Errors.E155.format())
-                if self.attr == DEP and not doc.is_parsed:
+                attrs = (TAG, POS, MORPH, LEMMA, DEP)
+                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
+                if self.attr == TAG and not has_annotation[TAG]:
+                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
+                if self.attr == POS and not has_annotation[POS]:
+                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
+                if self.attr == MORPH and not has_annotation[MORPH]:
+                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
+                if self.attr == LEMMA and not has_annotation[LEMMA]:
+                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
+                if self.attr == DEP and not has_annotation[DEP]:
                    raise ValueError(Errors.E156.format())
-                if self._validate and (doc.is_tagged or doc.is_parsed) \
-                        and self.attr not in (DEP, POS, TAG, LEMMA):
+                if self._validate and any(has_annotation.values()) \
+                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
                    warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
            keyword = self._convert_to_array(doc)

View File

@@ -164,7 +164,7 @@ def MultiHashEmbed(

@registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
+def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
    """Construct an embedded representation based on character embeddings, using
    a feed-forward network. A fixed number of UTF-8 byte characters are used for
    each word, taken from the beginning and end of the word equally. Padding is
@@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
    nC (int): The number of UTF-8 bytes to embed per word. Recommended values
        are between 3 and 8, although it may depend on the length of words in the
        language.
+    also_use_static_vectors (bool): Whether to also use static word vectors.
+        Requires a vectors table to be loaded in the Doc objects' vocab.
    """
-    model = chain(
-        concatenate(
-            chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
-            chain(
-                FeatureExtractor([NORM]),
-                list2ragged(),
-                with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
-            ),
-        ),
-        with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-        ragged2list(),
-    )
+    if also_use_static_vectors:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([NORM]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+                StaticVectors(width, dropout=0.0),
+            ),
+            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
+            ragged2list(),
+        )
+    else:
+        model = chain(
+            concatenate(
+                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
+                chain(
+                    FeatureExtractor([NORM]),
+                    list2ragged(),
+                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+                ),
+            ),
+            with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
+            ragged2list(),
+        )
    return model
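
For orientation, both variants of the architecture can be built straight from the registry rather than through a training config; this is only a sketch of the new signature above, and registry.architectures.get is assumed to resolve the registered name:

from spacy.util import registry

make_character_embed = registry.architectures.get("spacy.CharacterEmbed.v1")

# Character embeddings only, as before.
plain = make_character_embed(width=128, rows=7000, nM=64, nC=8,
                             also_use_static_vectors=False)

# Additionally concatenates StaticVectors, so the vocab of the processed
# Docs is expected to have a vectors table loaded.
with_vectors = make_character_embed(width=128, rows=7000, nM=64, nC=8,
                                    also_use_static_vectors=True)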

View File

@@ -679,8 +679,7 @@ cdef class ArcEager(TransitionSystem):
            st._sent[i].dep = self.root_label

    def finalize_doc(self, Doc doc):
-        doc.is_parsed = True
-        set_children_from_heads(doc.c, doc.length)
+        set_children_from_heads(doc.c, 0, doc.length)

    def has_gold(self, Example eg, start=0, end=None):
        for word in eg.y[start:end]:

View File

@@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc):
            new_head = _find_new_head(doc[i], head_label)
            doc.c[i].head = new_head.i - i
            doc.c[i].dep = doc.vocab.strings.add(new_label)
-    set_children_from_heads(doc.c, doc.length)
+    set_children_from_heads(doc.c, 0, doc.length)
    return doc

View File

@@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
    DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
    """
-    if not doc.is_parsed:
+    if not doc.has_annotation("DEP"):
        return doc
    with doc.retokenize() as retokenizer:
        for np in doc.noun_chunks:

View File

@@ -32,6 +32,7 @@ width = 128
rows = 7000
nM = 64
nC = 8
+also_use_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -203,8 +204,6 @@ class Morphologizer(Tagger):
                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
                doc.c[j].pos = self.cfg["labels_pos"][morph]
-        doc.is_morphed = True

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
        their predicted scores.
@@ -259,79 +258,3 @@ class Morphologizer(Tagger):
        results.update(Scorer.score_token_attr_per_feat(examples,
            "morph", **kwargs))
        return results
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-        DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
-        """
-        serialize = {}
-        serialize["model"] = self.model.to_bytes
-        serialize["vocab"] = self.vocab.to_bytes
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-        bytes_data (bytes): The serialized pipe.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Morphologizer): The loaded Morphologizer.
-        DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
-        """
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
-            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-            "model": lambda b: load_model(b),
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
-            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
-            "cfg": lambda p: srsly.write_json(p, self.cfg),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk. Modifies the object in place and returns it.
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Morphologizer): The modified Morphologizer object.
-        DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
-        """
-        def load_model(p):
-            with p.open("rb") as file_:
-                try:
-                    self.model.from_bytes(file_.read())
-                except AttributeError:
-                    raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
-            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "model": load_model,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self

View File

@@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger):
        results = Scorer.score_spans(examples, "sents", **kwargs)
        del results["sents_per_type"]
        return results
-
-    def to_bytes(self, *, exclude=tuple()):
-        """Serialize the pipe to a bytestring.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized object.
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
-        """
-        serialize = {}
-        serialize["model"] = self.model.to_bytes
-        serialize["vocab"] = self.vocab.to_bytes
-        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
-        return util.to_bytes(serialize, exclude)
-
-    def from_bytes(self, bytes_data, *, exclude=tuple()):
-        """Load the pipe from a bytestring.
-        bytes_data (bytes): The serialized pipe.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The loaded SentenceRecognizer.
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
-        """
-        def load_model(b):
-            try:
-                self.model.from_bytes(b)
-            except AttributeError:
-                raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda b: self.vocab.from_bytes(b),
-            "cfg": lambda b: self.cfg.update(srsly.json_loads(b)),
-            "model": lambda b: load_model(b),
-        }
-        util.from_bytes(bytes_data, deserialize, exclude)
-        return self
-
-    def to_disk(self, path, *, exclude=tuple()):
-        """Serialize the pipe to disk.
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
-        """
-        serialize = {
-            "vocab": lambda p: self.vocab.to_disk(p),
-            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
-            "cfg": lambda p: srsly.write_json(p, self.cfg),
-        }
-        util.to_disk(path, serialize, exclude)
-
-    def from_disk(self, path, *, exclude=tuple()):
-        """Load the pipe from disk. Modifies the object in place and returns it.
-        path (str / Path): Path to a directory.
-        exclude (Iterable[str]): String names of serialization fields to exclude.
-        RETURNS (Tagger): The modified SentenceRecognizer object.
-        DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
-        """
-        def load_model(p):
-            with p.open("rb") as file_:
-                try:
-                    self.model.from_bytes(file_.read())
-                except AttributeError:
-                    raise ValueError(Errors.E149) from None
-
-        deserialize = {
-            "vocab": lambda p: self.vocab.from_disk(p),
-            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
-            "model": load_model,
-        }
-        util.from_disk(path, deserialize, exclude)
-        return self

View File

@@ -168,7 +168,6 @@ class Tagger(Pipe):
                # Don't clobber preset POS tags
                if doc.c[j].tag == 0:
                    doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
-        doc.is_tagged = True

    def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
        """Learn from a batch of documents and gold-standard information,

View File

@@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text):
    tokens = en_tokenizer(text)
    tokens[0].lemma_ = "lemma"
    tokens[0].norm_ = "norm"
+    tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
    tokens[0].ent_kb_id_ = "ent_kb_id"
    new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
    assert tokens.text == new_tokens.text
@@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer):

def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
-    doc.is_parsed = True
    sents = list(doc.sents)
    assert len(sents) == 0
@@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer):
    text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
    heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
             -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    deps = ["dep"] * len(heads)
    # fmt: on
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[6].text == "for"
    subtree = [w.text for w in doc[6].subtree]
    # fmt: off
@@ -240,7 +241,9 @@ def test_doc_api_similarity_match():
)
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
    tokens = en_tokenizer(sentence)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    doc = get_doc(
+        tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
+    )
    lca = doc.get_lca_matrix()
    assert (lca == lca_matrix).all()
    assert lca[1, 1] == 1
@@ -251,51 +254,55 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):

def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
-    assert not doc.is_nered
+    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
-    assert doc.is_nered
+    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
-    assert new_doc.is_nered
+    assert new_doc.has_annotation("ENT_IOB")

def test_doc_from_array_sent_starts(en_vocab):
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
-    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
+    heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
    # fmt: off
-    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
+    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
-    doc = Doc(en_vocab, words=words)
-    for i, (dep, head) in enumerate(zip(deps, heads)):
-        doc[i].dep_ = dep
-        doc[i].head = doc[head]
-        if head == i:
-            doc[i].is_sent_start = True
-    doc.is_parsed
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)

+    # HEAD overrides SENT_START with warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
-    with pytest.raises(ValueError):
+    with pytest.warns(UserWarning):
        new_doc.from_array(attrs, arr)

-    attrs = [SENT_START, DEP]
+    # no warning using default attrs
+    attrs = doc._get_array_attrs()
+    arr = doc.to_array(attrs)
+    with pytest.warns(None) as record:
+        new_doc.from_array(attrs, arr)
+    assert len(record) == 0

+    # only SENT_START uses SENT_START
+    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert not new_doc.is_parsed
+    assert not new_doc.has_annotation("DEP")

+    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
-    assert new_doc.is_parsed
+    assert new_doc.has_annotation("DEP")

def test_doc_from_array_morph(en_vocab):
@@ -365,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert m_doc[9].idx == think_idx

    m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    with pytest.raises(ValueError):
-        # important attributes from sentenziser or parser are missing
-        assert list(m_doc.sents)
    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
    # space delimiter considered, although spacy attribute was missing
    assert str(m_doc) == " ".join(en_texts_without_empty)
@@ -379,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    assert m_doc[9].idx == think_idx

+def test_doc_api_from_docs_ents(en_tokenizer):
+    texts = ["Merging the docs is fun.", "They don't think alike."]
+    docs = [en_tokenizer(t) for t in texts]
+    docs[0].ents = ()
+    docs[1].ents = (Span(docs[1], 0, 1, label="foo"),)
+    doc = Doc.from_docs(docs)
+    assert len(doc.ents) == 1

def test_doc_lang(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    assert doc.lang_ == "en"
@@ -399,3 +412,45 @@ def test_token_lexeme(en_vocab):
    assert isinstance(token.lex, Lexeme)
    assert token.lex.text == token.text
    assert en_vocab[token.orth] == token.lex

+def test_has_annotation(en_vocab):
+    doc = Doc(en_vocab, words=["Hello", "world"])
+    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
+    for attr in attrs:
+        assert not doc.has_annotation(attr)
+    doc[0].tag_ = "A"
+    doc[0].pos_ = "X"
+    doc[0].morph_ = "Feat=Val"
+    doc[0].lemma_ = "a"
+    doc[0].dep_ = "dep"
+    doc[0].head = doc[1]
+    doc.ents = [Span(doc, 0, 1, label="HELLO")]
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert not doc.has_annotation(attr, require_complete=True)
+    doc[1].tag_ = "A"
+    doc[1].pos_ = "X"
+    doc[1].morph_ = ""
+    doc[1].lemma_ = "a"
+    doc[1].dep_ = "dep"
+    doc.ents = [Span(doc, 0, 2, label="HELLO")]
+    for attr in attrs:
+        assert doc.has_annotation(attr)
+        assert doc.has_annotation(attr, require_complete=True)

+def test_is_flags_deprecated(en_tokenizer):
+    doc = en_tokenizer("test")
+    with pytest.deprecated_call():
+        doc.is_tagged
+    with pytest.deprecated_call():
+        doc.is_parsed
+    with pytest.deprecated_call():
+        doc.is_nered
+    with pytest.deprecated_call():
+        doc.is_sentenced

View File

@@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer):
    text = "This is a sentence. This is another sentence. And a third."
    tokens = en_tokenizer(text)
    doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.is_parsed = False
    return doc
@@ -71,8 +70,9 @@ def test_spans_string_fn(doc):

def test_spans_root2(en_tokenizer):
    text = "through North and South Carolina"
    heads = [0, 3, -1, -2, -4]
+    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[-2:].root.text == "Carolina"
@@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed):

def test_spans_lca_matrix(en_tokenizer):
    """Test span's lca matrix generation"""
    tokens = en_tokenizer("the lazy dog slept")
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
    lca = doc[:2].get_lca_matrix()
    assert lca.shape == (2, 2)
    assert lca[0, 0] == 0  # the & the -> the

View File

@@ -112,11 +112,11 @@ def test_doc_token_api_ancestors(en_tokenizer):

def test_doc_token_api_head_setter(en_tokenizer):
-    # the structure of this sentence depends on the English annotation scheme
    text = "Yesterday I saw a dog that barked loudly."
    heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
+    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert doc[6].n_lefts == 1
    assert doc[6].n_rights == 1
@@ -169,13 +169,46 @@ def test_doc_token_api_head_setter(en_tokenizer):
    with pytest.raises(ValueError):
        doc[0].head = doc2[0]

+    # test sentence starts when two sentences are joined
+    text = "This is one sentence. This is another sentence."
+    heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
+    tokens = en_tokenizer(text)
+    doc = get_doc(
+        tokens.vocab,
+        words=[t.text for t in tokens],
+        heads=heads,
+        deps=["dep"] * len(heads),
+    )
+    # initially two sentences
+    assert doc[0].is_sent_start
+    assert doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[4]
+    assert doc[5].left_edge == doc[5]
+    assert doc[5].right_edge == doc[9]
+    # modifying with a sentence doesn't change sent starts
+    doc[2].head = doc[3]
+    assert doc[0].is_sent_start
+    assert doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[4]
+    assert doc[5].left_edge == doc[5]
+    assert doc[5].right_edge == doc[9]
+    # attach the second sentence to the first, resulting in one sentence
+    doc[5].head = doc[0]
+    assert doc[0].is_sent_start
+    assert not doc[5].is_sent_start
+    assert doc[0].left_edge == doc[0]
+    assert doc[0].right_edge == doc[9]

def test_is_sent_start(en_tokenizer):
    doc = en_tokenizer("This is a sentence. This is another.")
    assert doc[5].is_sent_start is None
    doc[5].is_sent_start = True
    assert doc[5].is_sent_start is True
-    doc.is_parsed = True
    assert len(list(doc.sents)) == 2
@@ -184,7 +217,6 @@ def test_is_sent_end(en_tokenizer):
    assert doc[4].is_sent_end is None
    doc[5].is_sent_start = True
    assert doc[4].is_sent_end is True
-    doc.is_parsed = True
    assert len(list(doc.sents)) == 2
@@ -209,14 +241,14 @@ def test_token0_has_sent_start_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_start is True
    assert doc[1].is_sent_start is None
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")

def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
    assert doc[1].is_sent_end is True
-    assert not doc.is_sentenced
+    assert not doc.has_annotation("SENT_START")

def test_token_api_conjuncts_chain(en_vocab):

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_de(de_tokenizer):
    """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = de_tokenizer("Er lag auf seinem")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_el(el_tokenizer):
    """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -11,12 +11,8 @@ from ...util import get_doc

def test_noun_chunks_is_parsed(en_tokenizer):
    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = en_tokenizer("This is a sentence")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
+    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text + punct)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert len(doc) == 4 if punct else 3
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_es(es_tokenizer):
    """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = es_tokenizer("en Oxford este verano")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -3,12 +3,8 @@ import pytest

def test_noun_chunks_is_parsed_fa(fa_tokenizer):
    """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = fa_tokenizer("این یک جمله نمونه می باشد.")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_fr(fr_tokenizer):
    """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = fr_tokenizer("trouver des travaux antérieurs")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_id(id_tokenizer):
    """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = id_tokenizer("sebelas")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -3,11 +3,7 @@ import pytest

def test_noun_chunks_is_parsed_nb(nb_tokenizer):
    """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = nb_tokenizer("Smørsausen brukes bl.a. til")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -5,12 +5,8 @@ from ...util import get_doc

def test_noun_chunks_is_parsed_sv(sv_tokenizer):
    """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
-    To check this test, we're constructing a Doc
-    with a new Vocab here and forcing is_parsed to 'False'
-    to make sure the noun chunks don't run.
    """
    doc = sv_tokenizer("Studenten läste den bästa boken")
-    doc.is_parsed = False
    with pytest.raises(ValueError):
        list(doc.noun_chunks)

View File

@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: pl
-LANGUAGES = ["el", "en", "fr", "nl"]
+LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on

View File

@@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):

def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
+    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"DEP": "a"}]])
    matcher(doc1)
@@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{attr: "a"}]])

View File

@@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):

def test_phrase_matcher_validation(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
    doc3 = Doc(en_vocab, words=["Test"])
    matcher = PhraseMatcher(en_vocab, validate=True)
    with pytest.warns(UserWarning):
@@ -212,18 +214,21 @@ def test_attr_validation(en_vocab):

def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
-    doc1.is_parsed = True
+    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
-    doc2.is_tagged = True
+    doc2[0].tag_ = "TAG"
+    doc2[0].pos_ = "X"
+    doc2[0].morph_ = "Feat=Val"
+    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
-    # DEP requires is_parsed
+    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
-    # TAG, POS, LEMMA require is_tagged
+    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])

View File

@@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser):

def test_parser_parse_subtrees(en_tokenizer, en_parser):
    text = "The four wheels on the bus turned quickly"
    heads = [2, 1, 4, -1, 1, -2, 0, -1]
+    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    assert len(list(doc[2].lefts)) == 2
    assert len(list(doc[2].rights)) == 1
    assert len(list(doc[2].children)) == 3
@@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab):
        if i == 0 or i == 3:
            assert doc[i].is_sent_start is True
        else:
-            assert doc[i].is_sent_start is None
+            assert doc[i].is_sent_start is False
    for sent in doc.sents:
        for token in sent:
            assert token.head in sent

View File

@@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):

def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
+    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
    lefts = {}
    rights = {}

View File

@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence
def test_parser_space_attachment(en_tokenizer): def test_parser_space_attachment(en_tokenizer):
text = "This is a test.\nTo ensure spaces are attached well." text = "This is a test.\nTo ensure spaces are attached well."
heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
deps = ["dep"] * len(heads)
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
for sent in doc.sents: for sent in doc.sents:
if len(sent) == 1: if len(sent) == 1:
assert not sent[-1].is_space assert not sent[-1].is_space

View File

@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
def test_attributeruler_init_patterns(nlp, pattern_dicts): def test_attributeruler_init_patterns(nlp, pattern_dicts):
@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler") nlp.remove_pipe("attribute_ruler")
# initialize with patterns from asset # initialize with patterns from asset
nlp.add_pipe( nlp.add_pipe(
@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[2].morph_ == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat" assert doc[3].lemma_ == "cat"
assert doc[3].morph_ == "Case=Nom|Number=Sing" assert doc[3].morph_ == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
def test_attributeruler_score(nlp, pattern_dicts): def test_attributeruler_score(nlp, pattern_dicts):

View File

@ -35,8 +35,6 @@ def doc2(en_tokenizer):
deps=deps, deps=deps,
) )
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
doc.is_parsed = True
doc.is_tagged = True
return doc return doc

View File

@ -9,7 +9,7 @@ def test_sentencizer(en_vocab):
doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
sentencizer = Sentencizer(punct_chars=None) sentencizer = Sentencizer(punct_chars=None)
doc = sentencizer(doc) doc = sentencizer(doc)
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc] sent_starts = [t.is_sent_start for t in doc]
sent_ends = [t.is_sent_end for t in doc] sent_ends = [t.is_sent_end for t in doc]
assert sent_starts == [True, False, True, False, False, False, False] assert sent_starts == [True, False, True, False, False, False, False]
@ -22,13 +22,13 @@ def test_sentencizer_pipe():
nlp = English() nlp = English()
nlp.add_pipe("sentencizer") nlp.add_pipe("sentencizer")
for doc in nlp.pipe(texts): for doc in nlp.pipe(texts):
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc] sent_starts = [t.is_sent_start for t in doc]
assert sent_starts == [True, False, True, False, False, False, False] assert sent_starts == [True, False, True, False, False, False, False]
assert len(list(doc.sents)) == 2 assert len(list(doc.sents)) == 2
for ex in nlp.pipe(texts): for ex in nlp.pipe(texts):
doc = ex.doc doc = ex.doc
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc] sent_starts = [t.is_sent_start for t in doc]
assert sent_starts == [True, False, True, False, False, False, False] assert sent_starts == [True, False, True, False, False, False, False]
assert len(list(doc.sents)) == 2 assert len(list(doc.sents)) == 2
@ -42,7 +42,7 @@ def test_sentencizer_empty_docs():
nlp.add_pipe("sentencizer") nlp.add_pipe("sentencizer")
for texts in [one_empty_text, many_empty_texts, some_empty_texts]: for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
for doc in nlp.pipe(texts): for doc in nlp.pipe(texts):
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
sent_starts = [t.is_sent_start for t in doc] sent_starts = [t.is_sent_start for t in doc]
if len(doc) == 0: if len(doc) == 0:
assert sent_starts == [] assert sent_starts == []
@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
sentencizer = Sentencizer(punct_chars=None) sentencizer = Sentencizer(punct_chars=None)
doc = sentencizer(doc) doc = sentencizer(doc)
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_start for t in doc] == sent_starts
assert [t.is_sent_end for t in doc] == sent_ends assert [t.is_sent_end for t in doc] == sent_ends
assert len(list(doc.sents)) == n_sents assert len(list(doc.sents)) == n_sents
@ -115,7 +115,7 @@ def test_sentencizer_custom_punct(
doc = Doc(en_vocab, words=words) doc = Doc(en_vocab, words=words)
sentencizer = Sentencizer(punct_chars=punct_chars) sentencizer = Sentencizer(punct_chars=punct_chars)
doc = sentencizer(doc) doc = sentencizer(doc)
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_start for t in doc] == sent_starts
assert [t.is_sent_end for t in doc] == sent_ends assert [t.is_sent_end for t in doc] == sent_ends
assert len(list(doc.sents)) == n_sents assert len(list(doc.sents)) == n_sents

View File

@ -94,7 +94,6 @@ def test_issue309(en_tokenizer):
doc = get_doc( doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
) )
doc.is_parsed = True
assert len(doc) == 1 assert len(doc) == 1
sents = list(doc.sents) sents = list(doc.sents)
assert len(sents) == 1 assert len(sents) == 1
@ -170,11 +169,9 @@ def test_issue595():
def test_issue599(en_vocab): def test_issue599(en_vocab):
doc = Doc(en_vocab) doc = Doc(en_vocab)
doc.is_tagged = True
doc.is_parsed = True
doc2 = Doc(doc.vocab) doc2 = Doc(doc.vocab)
doc2.from_bytes(doc.to_bytes()) doc2.from_bytes(doc.to_bytes())
assert doc2.is_parsed assert doc2.has_annotation("DEP")
def test_issue600(): def test_issue600():

View File

@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
from spacy.attrs import HEAD, DEP from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher from spacy.matcher import Matcher
from ..util import make_tempdir from ..util import make_tempdir, get_doc
def test_issue1506(): def test_issue1506():
@ -198,17 +198,26 @@ def test_issue1834():
"""Test that sentence boundaries & parse/tag flags are not lost """Test that sentence boundaries & parse/tag flags are not lost
during serialization.""" during serialization."""
string = "This is a first sentence . And another one" string = "This is a first sentence . And another one"
doc = Doc(Vocab(), words=string.split()) words = string.split()
doc[6].sent_start = True doc = get_doc(Vocab(), words=words)
doc[6].is_sent_start = True
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
assert new_doc[6].sent_start assert new_doc[6].sent_start
assert not new_doc.is_parsed assert not new_doc.has_annotation("DEP")
assert not new_doc.is_tagged assert not new_doc.has_annotation("TAG")
doc.is_parsed = True doc = get_doc(
doc.is_tagged = True Vocab(),
words=words,
tags=["TAG"] * len(words),
heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
deps=["dep"] * len(words),
)
print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
assert new_doc.is_parsed print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
assert new_doc.is_tagged assert new_doc[6].sent_start
assert new_doc.has_annotation("DEP")
assert new_doc.has_annotation("TAG")
def test_issue1868(): def test_issue1868():

View File

@ -72,8 +72,6 @@ def test_issue2219(en_vocab):
def test_issue2361(de_tokenizer): def test_issue2361(de_tokenizer):
chars = ("<", ">", "&", """) chars = ("<", ">", "&", """)
doc = de_tokenizer('< > & " ') doc = de_tokenizer('< > & " ')
doc.is_parsed = True
doc.is_tagged = True
html = render(doc) html = render(doc)
for char in chars: for char in chars:
assert char in html assert char in html
@ -108,6 +106,7 @@ def test_issue2385_biluo(tags):
def test_issue2396(en_vocab): def test_issue2396(en_vocab):
words = ["She", "created", "a", "test", "for", "spacy"] words = ["She", "created", "a", "test", "for", "spacy"]
heads = [1, 0, 1, -2, -1, -1] heads = [1, 0, 1, -2, -1, -1]
deps = ["dep"] * len(heads)
matrix = numpy.array( matrix = numpy.array(
[ [
[0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1],
@ -119,7 +118,7 @@ def test_issue2396(en_vocab):
], ],
dtype=numpy.int32, dtype=numpy.int32,
) )
doc = get_doc(en_vocab, words=words, heads=heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span = doc[:] span = doc[:]
assert (doc.get_lca_matrix() == matrix).all() assert (doc.get_lca_matrix() == matrix).all()
assert (span.get_lca_matrix() == matrix).all() assert (span.get_lca_matrix() == matrix).all()

View File

@ -16,16 +16,16 @@ from ..util import get_doc
def test_issue2564(): def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe.""" """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
nlp = Language() nlp = Language()
tagger = nlp.add_pipe("tagger") tagger = nlp.add_pipe("tagger")
tagger.add_label("A") tagger.add_label("A")
nlp.begin_training() nlp.begin_training()
doc = nlp("hello world") doc = nlp("hello world")
assert doc.is_tagged assert doc.has_annotation("TAG")
docs = nlp.pipe(["hello", "world"]) docs = nlp.pipe(["hello", "world"])
piped_doc = next(docs) piped_doc = next(docs)
assert piped_doc.is_tagged assert piped_doc.has_annotation("TAG")
def test_issue2569(en_tokenizer): def test_issue2569(en_tokenizer):
@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
deps = ["dep"] * len(heads) deps = ["dep"] * len(heads)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
assert doc[1].is_sent_start is None assert doc[1].is_sent_start is False
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])

View File

@ -63,7 +63,7 @@ def test_issue3012(en_vocab):
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
ents = [(2, 4, "PERCENT")] ents = [(2, 4, "PERCENT")]
doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
assert doc.is_tagged assert doc.has_annotation("TAG")
expected = ("10", "NUM", "CD", "PERCENT") expected = ("10", "NUM", "CD", "PERCENT")
assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
@ -83,10 +83,14 @@ def test_issue3012(en_vocab):
def test_issue3199(): def test_issue3199():
"""Test that Span.noun_chunks works correctly if no noun chunks iterator """Test that Span.noun_chunks works correctly if no noun chunks iterator
is available. To make this test future-proof, we're constructing a Doc is available. To make this test future-proof, we're constructing a Doc
with a new Vocab here and setting is_parsed to make sure the noun chunks run. with a new Vocab here and a parse tree to make sure the noun chunks run.
""" """
doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) doc = get_doc(
doc.is_parsed = True Vocab(),
words=["This", "is", "a", "sentence"],
heads=[0, -1, -2, -3],
deps=["dep"] * 4,
)
assert list(doc[0:3].noun_chunks) == [] assert list(doc[0:3].noun_chunks) == []
@ -250,16 +254,16 @@ def test_issue3456():
def test_issue3468(): def test_issue3468():
"""Test that sentence boundaries are set correctly so Doc.is_sentenced can """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
be restored after serialization.""" be restored after serialization."""
nlp = English() nlp = English()
nlp.add_pipe("sentencizer") nlp.add_pipe("sentencizer")
doc = nlp("Hello world") doc = nlp("Hello world")
assert doc[0].is_sent_start assert doc[0].is_sent_start
assert doc.is_sentenced assert doc.has_annotation("SENT_START")
assert len(list(doc.sents)) == 1 assert len(list(doc.sents)) == 1
doc_bytes = doc.to_bytes() doc_bytes = doc.to_bytes()
new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
assert new_doc[0].is_sent_start assert new_doc[0].is_sent_start
assert new_doc.is_sentenced assert new_doc.has_annotation("SENT_START")
assert len(list(new_doc.sents)) == 1 assert len(list(new_doc.sents)) == 1

View File

@ -356,7 +356,6 @@ def test_issue3882(en_vocab):
copy of the Doc. copy of the Doc.
""" """
doc = Doc(en_vocab, words=["Hello", "world"]) doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set() doc.user_data["test"] = set()
parse_deps(doc) parse_deps(doc)
@ -386,7 +385,6 @@ def test_issue3959():
doc[0].pos_ = "NOUN" doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN" assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English # usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir: with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc" file_path = tmp_dir / "my_doc"
doc.to_disk(file_path) doc.to_disk(file_path)

View File

@ -189,7 +189,6 @@ def test_issue4133(en_vocab):
for i, token in enumerate(doc): for i, token in enumerate(doc):
token.pos_ = pos[i] token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English # usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
doc_bytes = doc.to_bytes() doc_bytes = doc.to_bytes()
vocab = Vocab() vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes) vocab = vocab.from_bytes(vocab_bytes)
@ -249,7 +248,7 @@ def test_issue4267():
assert "ner" in nlp.pipe_names assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations # assert that we have correct IOB annotations
doc1 = nlp("hi") doc1 = nlp("hi")
assert doc1.is_nered assert doc1.has_annotation("ENT_IOB")
for token in doc1: for token in doc1:
assert token.ent_iob == 2 assert token.ent_iob == 2
# add entity ruler and run again # add entity ruler and run again
@ -260,7 +259,7 @@ def test_issue4267():
assert "ner" in nlp.pipe_names assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations # assert that we still have correct IOB annotations
doc2 = nlp("hi") doc2 = nlp("hi")
assert doc2.is_nered assert doc2.has_annotation("ENT_IOB")
for token in doc2: for token in doc2:
assert token.ent_iob == 2 assert token.ent_iob == 2

View File

@ -298,4 +298,4 @@ def test_language_init_invalid_vocab(value):
err_fragment = "invalid value" err_fragment = "invalid value"
with pytest.raises(ValueError) as e: with pytest.raises(ValueError) as e:
Language(value) Language(value)
assert err_fragment in str(e) assert err_fragment in str(e.value)

View File

@ -80,7 +80,6 @@ def tagged_doc():
doc[i].morph_ = morphs[i] doc[i].morph_ = morphs[i]
if i > 0: if i > 0:
doc[i].is_sent_start = False doc[i].is_sent_start = False
doc.is_tagged = True
return doc return doc

View File

@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
[ [
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
], ],
) )
# fmt: on # fmt: on

View File

@ -12,7 +12,7 @@ from thinc.api import compounding
import pytest import pytest
import srsly import srsly
from .util import make_tempdir from .util import make_tempdir, get_doc
@pytest.fixture @pytest.fixture
@ -26,24 +26,16 @@ def doc():
"NounType=prop|Number=sing", "PunctType=peri"] "NounType=prop|Number=sing", "PunctType=peri"]
# head of '.' is intentionally nonprojective for testing # head of '.' is intentionally nonprojective for testing
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
heads = [head - i for i, head in enumerate(heads)]
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
cats = {"TRAVEL": 1.0, "BAKING": 0.0} cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on # fmt: on
nlp = English() nlp = English()
doc = nlp(text) words = [t.text for t in nlp.make_doc(text)]
for i in range(len(tags)): doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
doc[i].tag_ = tags[i]
doc[i].pos_ = pos[i]
doc[i].morph_ = morphs[i]
doc[i].lemma_ = lemmas[i]
doc[i].dep_ = deps[i]
doc[i].head = doc[heads[i]]
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
doc.cats = cats doc.cats = cats
doc.is_tagged = True
doc.is_parsed = True
return doc return doc
@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab):
docs = json2docs(data) docs = json2docs(data)
assert len(docs) == 1 assert len(docs) == 1
for doc in docs: for doc in docs:
assert not doc.is_nered assert not doc.has_annotation("ENT_IOB")
for token in doc: for token in doc:
assert token.ent_iob == 0 assert token.ent_iob == 0
eg = Example( eg = Example(

View File

@ -274,7 +274,7 @@ def _merge(Doc doc, merges):
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head -= i doc.c[i].head -= i
# Set the left/right children, left/right edges # Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length) set_children_from_heads(doc.c, 0, doc.length)
# Make sure ent_iob remains consistent # Make sure ent_iob remains consistent
make_iob_consistent(doc.c, doc.length) make_iob_consistent(doc.c, doc.length)
# Return the merged Python object # Return the merged Python object
@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head -= i doc.c[i].head -= i
# set children from head # set children from head
set_children_from_heads(doc.c, doc.length) set_children_from_heads(doc.c, 0, doc.length)
def _validate_extensions(extensions): def _validate_extensions(extensions):
@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length):
def normalize_token_attrs(Vocab vocab, attrs): def normalize_token_attrs(Vocab vocab, attrs):
if "_" in attrs: # Extension attributes if "_" in attrs: # Extension attributes
extensions = attrs["_"] extensions = attrs["_"]
print("EXTENSIONS", extensions)
_validate_extensions(extensions) _validate_extensions(extensions)
attrs = {key: value for key, value in attrs.items() if key != "_"} attrs = {key: value for key, value in attrs.items() if key != "_"}
attrs = intify_attrs(attrs, strings_map=vocab.strings) attrs = intify_attrs(attrs, strings_map=vocab.strings)

View File

@ -13,7 +13,7 @@ from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList from ..util import ensure_path, SimpleFrozenList
# fmt: off # fmt: off
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
# fmt: on # fmt: on
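For context outside the diff: this `ALL_ATTRS` tuple is what `DocBin` serializes by default, so norms, entity IDs and sentence starts now survive a round trip without passing a custom `attrs` list. A minimal sketch (the blank pipeline and sentencizer are just a convenient way to produce `SENT_START` values; names and text are illustrative):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # sets SENT_START, one of the newly serialized attrs
doc = nlp("Hello world. Another sentence.")

doc_bin = DocBin()           # default attrs come from ALL_ATTRS
doc_bin.add(doc)
restored = list(DocBin().from_bytes(doc_bin.to_bytes()).get_docs(spacy.blank("en").vocab))[0]
assert [t.is_sent_start for t in restored] == [t.is_sent_start for t in doc]
```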

View File

@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
const_TokenC_ptr const_TokenC_ptr
cdef int set_children_from_heads(TokenC* tokens, int length) except -1 cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
cdef class Doc: cdef class Doc:
@ -49,10 +46,6 @@ cdef class Doc:
cdef TokenC* c cdef TokenC* c
cdef public bint is_tagged
cdef public bint is_parsed
cdef public bint is_morphed
cdef public float sentiment cdef public float sentiment
cdef public dict user_hooks cdef public dict user_hooks
@ -74,5 +67,3 @@ cdef class Doc:
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
cpdef np.ndarray to_array(self, object features) cpdef np.ndarray to_array(self, object features)
cdef void set_parse(self, const TokenC* parsed) nogil

View File

@ -1,37 +1,34 @@
# cython: infer_types=True, bounds_check=False, profile=True # cython: infer_types=True, bounds_check=False, profile=True
cimport cython cimport cython
cimport numpy as np cimport numpy as np
from libc.string cimport memcpy, memset from libc.string cimport memcpy
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
import copy import copy
from collections import Counter from collections import Counter
import numpy import numpy
import numpy.linalg
import struct
import srsly import srsly
from thinc.api import get_array_module from thinc.api import get_array_module
from thinc.util import copy_array from thinc.util import copy_array
import warnings import warnings
import copy
from .span cimport Span from .span cimport Span
from .token cimport Token from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport attr_id_t
from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attr, intify_attrs, IDS from ..attrs import intify_attr, IDS
from ..util import normalize_slice
from ..compat import copy_reg, pickle from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..morphology import Morphology
from .. import util from .. import util
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
from ._retokenize import Retokenizer from ._retokenize import Retokenizer
from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
DEF PADDING = 5 DEF PADDING = 5
@ -190,8 +187,6 @@ cdef class Doc:
self.c = data_start + PADDING self.c = data_start + PADDING
self.max_length = size self.max_length = size
self.length = 0 self.length = 0
self.is_tagged = False
self.is_parsed = False
self.sentiment = 0.0 self.sentiment = 0.0
self.cats = {} self.cats = {}
self.user_hooks = {} self.user_hooks = {}
@ -221,11 +216,6 @@ cdef class Doc:
else: else:
lexeme = self.vocab.get_by_orth(self.mem, word) lexeme = self.vocab.get_by_orth(self.mem, word)
self.push_back(lexeme, has_space) self.push_back(lexeme, has_space)
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
# There's no information we'd like to add to it, so I guess so?
if self.length == 0:
self.is_tagged = True
self.is_parsed = True
@property @property
def _(self): def _(self):
@ -233,37 +223,61 @@ cdef class Doc:
return Underscore(Underscore.doc_extensions, self) return Underscore(Underscore.doc_extensions, self)
@property @property
def is_sentenced(self): def is_tagged(self):
"""Check if the document has sentence boundaries assigned. This is warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
defined as having at least one of the following: return self.has_annotation("TAG")
a) An entry "sents" in doc.user_hooks"; @property
b) Doc.is_parsed is set to True; def is_parsed(self):
c) At least one token other than the first where sent_start is not None. warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
""" return self.has_annotation("DEP")
if "sents" in self.user_hooks:
return True
if self.is_parsed:
return True
if len(self) < 2:
return True
for i in range(1, self.length):
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
return True
return False
@property @property
def is_nered(self): def is_nered(self):
"""Check if the document has named entities set. Will return True if warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
*any* of the tokens has a named entity tag set (even if the others are return self.has_annotation("ENT_IOB")
unknown values), or if the document is empty.
@property
def is_sentenced(self):
warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
return self.has_annotation("SENT_START")
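The removed boolean flags live on only as deprecated read-only properties that forward to `has_annotation`, so old code keeps working but now emits W107 as a `DeprecationWarning`. An illustrative check, not part of the diff (blank pipeline and text are arbitrary):

```python
import warnings
import spacy

nlp = spacy.blank("en")
doc = nlp("deprecation check")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    flag = doc.is_parsed                      # old API, now a thin wrapper
assert flag == doc.has_annotation("DEP")      # same answer as the new API
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```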
def has_annotation(self, attr, *, require_complete=False):
"""Check whether the doc contains annotation on a token attribute.
attr (Union[int, str]): The attribute string name or int ID.
require_complete (bool): Whether to check that the attribute is set on
every token in the doc.
RETURNS (bool): Whether annotation is present.
DOCS: https://nightly.spacy.io/api/doc#has_annotation
""" """
if len(self) == 0:
# empty docs are always annotated
if self.length == 0:
return True return True
for i in range(self.length): cdef int i
if self.c[i].ent_iob != 0: cdef int range_start = 0
attr = intify_attr(attr)
# adjust attributes
if attr == HEAD:
# HEAD does not have an unset state, so rely on DEP
attr = DEP
elif attr == self.vocab.strings["IS_SENT_START"]:
# as in Matcher, allow IS_SENT_START as an alias of SENT_START
attr = SENT_START
# special cases for sentence boundaries
if attr == SENT_START:
if "sents" in self.user_hooks:
return True return True
return False # docs of length 1 always have sentence boundaries
if self.length == 1:
return True
range_start = 1
if require_complete:
return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
else:
return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
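Three behaviours of `has_annotation` worth spelling out: empty docs always count as annotated, `HEAD` is answered via `DEP` because head offsets have no unset value, and `require_complete=True` demands the attribute on every token rather than on any token. A hedged sketch with a hand-built `Doc` (the words and tag are arbitrary):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
assert Doc(vocab).has_annotation("DEP")        # empty docs are treated as annotated

doc = Doc(vocab, words=["partially", "tagged", "text"])
doc[0].tag_ = "NN"                             # annotate only the first token
assert doc.has_annotation("TAG")               # any annotated token is enough
assert not doc.has_annotation("TAG", require_complete=True)
assert not doc.has_annotation("HEAD")          # checked via DEP, which is unset
```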
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or `Span` object. """Get a `Token` or `Span` object.
@ -291,7 +305,7 @@ cdef class Doc:
DOCS: https://nightly.spacy.io/api/doc#getitem DOCS: https://nightly.spacy.io/api/doc#getitem
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step)
return Span(self, start, stop, label=0) return Span(self, start, stop, label=0)
if i < 0: if i < 0:
i = self.length + i i = self.length + i
@ -627,16 +641,13 @@ cdef class Doc:
@property @property
def sents(self): def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span` """Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. To improve accuracy on informal objects. Sentence spans have no label.
texts, spaCy calculates sentence boundaries from the syntactic
dependency parse. If the parser is disabled, the `sents` iterator will
be unavailable.
YIELDS (Span): Sentences in the document. YIELDS (Span): Sentences in the document.
DOCS: https://nightly.spacy.io/api/doc#sents DOCS: https://nightly.spacy.io/api/doc#sents
""" """
if not self.is_sentenced: if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030) raise ValueError(Errors.E030)
if "sents" in self.user_hooks: if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self) yield from self.user_hooks["sents"](self)
@ -660,10 +671,6 @@ cdef class Doc:
return self.vocab.lang return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
if self.length == 0:
# Flip these to false when we see the first token.
self.is_tagged = False
self.is_parsed = False
if self.length == self.max_length: if self.length == self.max_length:
self._realloc(self.length * 2) self._realloc(self.length * 2)
cdef TokenC* t = &self.c[self.length] cdef TokenC* t = &self.c[self.length]
@ -786,14 +793,6 @@ cdef class Doc:
for i in range(self.length, self.max_length + PADDING): for i in range(self.length, self.max_length + PADDING):
self.c[i].lex = &EMPTY_LEXEME self.c[i].lex = &EMPTY_LEXEME
cdef void set_parse(self, const TokenC* parsed) nogil:
# TODO: This method is fairly misleading atm. It's used by Parser
# to actually apply the parse calculated. Need to rethink this.
# Probably we should use from_array?
self.is_parsed = True
for i in range(self.length):
self.c[i] = parsed[i]
def from_array(self, attrs, array): def from_array(self, attrs, array):
"""Load attributes from a numpy array. Write to a `Doc` object, from an """Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes. `(M, N)` array of attributes.
@ -818,8 +817,8 @@ cdef class Doc:
if array.dtype != numpy.uint64: if array.dtype != numpy.uint64:
warnings.warn(Warnings.W028.format(type=array.dtype)) warnings.warn(Warnings.W028.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs: if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032) warnings.warn(Warnings.W106)
cdef int i, col cdef int i, col
cdef int32_t abs_head_index cdef int32_t abs_head_index
cdef attr_id_t attr_id cdef attr_id_t attr_id
@ -879,18 +878,17 @@ cdef class Doc:
# add morph to morphology table # add morph to morphology table
self.vocab.morphology.add(self.vocab.strings[value]) self.vocab.morphology.add(self.vocab.strings[value])
Token.set_struct_attr(token, attr_ids[j], value) Token.set_struct_attr(token, attr_ids[j], value)
# Set flags # If document is parsed, set children and sentence boundaries
self.is_parsed = bool(self.is_parsed or HEAD in attrs) if HEAD in attrs and DEP in attrs:
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) col = attrs.index(DEP)
# If document is parsed, set children if array[:, col].any():
if self.is_parsed: set_children_from_heads(self.c, 0, length)
set_children_from_heads(self.c, length)
return self return self
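`from_array` no longer flips any flags: children and sentence boundaries are recomputed whenever the loaded data contains non-empty `HEAD` and `DEP` columns, and combining `HEAD` with `SENT_START` now only warns (W106) instead of raising. A minimal round-trip sketch, with a hand-annotated parse standing in for real pipeline output (words and labels are arbitrary):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

vocab = Vocab()
src = Doc(vocab, words=["I", "like", "plums", "."])
for i, (head, dep) in enumerate([(1, "nsubj"), (1, "ROOT"), (1, "dobj"), (1, "punct")]):
    src[i].head = src[head]   # absolute index of the head token
    src[i].dep_ = dep

attrs = ["HEAD", "DEP"]
array = src.to_array(attrs)
dst = Doc(vocab, words=[t.text for t in src]).from_array(attrs, array)
assert dst.has_annotation("DEP")
assert [t.is_sent_start for t in dst] == [True, False, False, False]
```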
@staticmethod @staticmethod
def from_docs(docs, ensure_whitespace=True, attrs=None): def from_docs(docs, ensure_whitespace=True, attrs=None):
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share """Concatenate multiple Doc objects to form a new one. Raises an error
the same `Vocab`. if the `Doc` objects do not all share the same `Vocab`.
docs (list): A list of Doc objects. docs (list): A list of Doc objects.
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
@ -908,16 +906,7 @@ cdef class Doc:
(vocab,) = vocab (vocab,) = vocab
if attrs is None: if attrs is None:
attrs = [LEMMA, NORM] attrs = Doc._get_array_attrs()
if all(doc.is_nered for doc in docs):
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
# TODO: separate for is_morphed?
if all(doc.is_tagged for doc in docs):
attrs.extend([TAG, POS, MORPH])
if all(doc.is_parsed for doc in docs):
attrs.extend([HEAD, DEP])
else:
attrs.append(SENT_START)
else: else:
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
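`Doc.from_docs` previously guessed which attributes were safe to copy from the old flags; it now defaults to the same attribute set used for serialization. An illustrative concatenation (texts are arbitrary, and the pipeline is just a blank English tokenizer plus sentencizer):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc1 = nlp("This is one doc.")
doc2 = nlp("And this is another.")

merged = Doc.from_docs([doc1, doc2])        # default attrs now mirror DocBin's
assert merged.text == doc1.text + " " + doc2.text
assert merged.has_annotation("SENT_START")  # copied without any extra arguments
```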
@ -989,9 +978,6 @@ cdef class Doc:
other.tensor = copy.deepcopy(self.tensor) other.tensor = copy.deepcopy(self.tensor)
other.cats = copy.deepcopy(self.cats) other.cats = copy.deepcopy(self.cats)
other.user_data = copy.deepcopy(self.user_data) other.user_data = copy.deepcopy(self.user_data)
other.is_tagged = self.is_tagged
other.is_parsed = self.is_parsed
other.is_morphed = self.is_morphed
other.sentiment = self.sentiment other.sentiment = self.sentiment
other.has_unknown_spaces = self.has_unknown_spaces other.has_unknown_spaces = self.has_unknown_spaces
other.user_hooks = dict(self.user_hooks) other.user_hooks = dict(self.user_hooks)
@ -1065,22 +1051,16 @@ cdef class Doc:
DOCS: https://nightly.spacy.io/api/doc#to_bytes DOCS: https://nightly.spacy.io/api/doc#to_bytes
""" """
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] array_head = Doc._get_array_attrs()
if self.is_tagged:
array_head.extend([TAG, POS])
# If doc parsed add head and dep attribute
if self.is_parsed:
array_head.extend([HEAD, DEP])
# Otherwise add sent_start
else:
array_head.append(SENT_START)
strings = set() strings = set()
for token in self: for token in self:
strings.add(token.tag_) strings.add(token.tag_)
strings.add(token.lemma_) strings.add(token.lemma_)
strings.add(token.morph_)
strings.add(token.dep_) strings.add(token.dep_)
strings.add(token.ent_type_) strings.add(token.ent_type_)
strings.add(token.ent_kb_id_) strings.add(token.ent_kb_id_)
strings.add(token.ent_id_)
strings.add(token.norm_) strings.add(token.norm_)
# Msgpack doesn't distinguish between lists and tuples, which is # Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within # vexing for user data. As a best guess, we *know* that within
@ -1230,22 +1210,29 @@ cdef class Doc:
DOCS: https://nightly.spacy.io/api/doc#to_json DOCS: https://nightly.spacy.io/api/doc#to_json
""" """
data = {"text": self.text} data = {"text": self.text}
if self.is_nered: if self.has_annotation("ENT_IOB"):
data["ents"] = [{"start": ent.start_char, "end": ent.end_char, data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
"label": ent.label_} for ent in self.ents] "label": ent.label_} for ent in self.ents]
if self.is_sentenced: if self.has_annotation("SENT_START"):
sents = list(self.sents) sents = list(self.sents)
data["sents"] = [{"start": sent.start_char, "end": sent.end_char} data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
for sent in sents] for sent in sents]
if self.cats: if self.cats:
data["cats"] = self.cats data["cats"] = self.cats
data["tokens"] = [] data["tokens"] = []
attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
for token in self: for token in self:
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
if self.is_tagged: if include_annotation["TAG"]:
token_data["pos"] = token.pos_
token_data["tag"] = token.tag_ token_data["tag"] = token.tag_
if self.is_parsed: if include_annotation["POS"]:
token_data["pos"] = token.pos_
if include_annotation["MORPH"]:
token_data["morph"] = token.morph_
if include_annotation["LEMMA"]:
token_data["lemma"] = token.lemma_
if include_annotation["DEP"]:
token_data["dep"] = token.dep_ token_data["dep"] = token.dep_
token_data["head"] = token.head.i token_data["head"] = token.head.i
data["tokens"].append(token_data) data["tokens"].append(token_data)
@ -1291,6 +1278,12 @@ cdef class Doc:
j += 1 j += 1
return output return output
@staticmethod
def _get_array_attrs():
attrs = [LENGTH, SPACY]
attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
return tuple(attrs)
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i = token_by_char(tokens, length, start_char) cdef int i = token_by_char(tokens, length, start_char)
@ -1321,13 +1314,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
return mid return mid
return -1 return -1
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
cdef int set_children_from_heads(TokenC* tokens, int length) except -1: # note: end is exclusive
cdef TokenC* head cdef TokenC* head
cdef TokenC* child cdef TokenC* child
cdef int i cdef int i
# Set number of left/right children to 0. We'll increment it in the loops. # Set number of left/right children to 0. We'll increment it in the loops.
for i in range(length): for i in range(start, end):
tokens[i].l_kids = 0 tokens[i].l_kids = 0
tokens[i].r_kids = 0 tokens[i].r_kids = 0
tokens[i].l_edge = i tokens[i].l_edge = i
@ -1341,38 +1334,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
# without risking getting stuck in an infinite loop if something is # without risking getting stuck in an infinite loop if something is
# terribly malformed. # terribly malformed.
while not heads_within_sents: while not heads_within_sents:
heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
if loop_count > 10: if loop_count > 10:
warnings.warn(Warnings.W026) warnings.warn(Warnings.W026)
break break
loop_count += 1 loop_count += 1
# Set sentence starts # Set sentence starts
for i in range(length): for i in range(start, end):
if tokens[i].head == 0 and tokens[i].dep != 0: tokens[i].sent_start = -1
tokens[tokens[i].l_edge].sent_start = True for i in range(start, end):
if tokens[i].head == 0:
tokens[tokens[i].l_edge].sent_start = 1
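The net effect of the rewritten loop is that sentence starts inside the `[start, end)` window are derived purely from the heads: everything is reset to `-1` and each root's left edge becomes a sentence start. Seen from Python, hand-assigning two small trees should yield the following (a sketch; the words and dependency labels are arbitrary):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["This", "works", "Fine", "too"])
# Two tiny trees: "works" and "too" are the roots of their own sentences.
for child, head, dep in [(0, 1, "nsubj"), (1, 1, "ROOT"), (2, 3, "advmod"), (3, 3, "ROOT")]:
    doc[child].head = doc[head]
    doc[child].dep_ = dep
assert [t.is_sent_start for t in doc] == [True, False, True, False]
```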
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1: cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
# May be called multiple times due to non-projectivity. See issues #3170 # May be called multiple times due to non-projectivity. See issues #3170
# and #4688. # and #4688.
# Set left edges # Set left edges
cdef TokenC* head cdef TokenC* head
cdef TokenC* child cdef TokenC* child
cdef int i, j cdef int i, j
for i in range(length): for i in range(start, end):
child = &tokens[i] child = &tokens[i]
head = &tokens[i + child.head] head = &tokens[i + child.head]
if child < head and loop_count == 0: if loop_count == 0 and child < head:
head.l_kids += 1 head.l_kids += 1
if child.l_edge < head.l_edge: if child.l_edge < head.l_edge:
head.l_edge = child.l_edge head.l_edge = child.l_edge
if child.r_edge > head.r_edge: if child.r_edge > head.r_edge:
head.r_edge = child.r_edge head.r_edge = child.r_edge
# Set right edges - same as above, but iterate in reverse # Set right edges - same as above, but iterate in reverse
for i in range(length-1, -1, -1): for i in range(end-1, start-1, -1):
child = &tokens[i] child = &tokens[i]
head = &tokens[i + child.head] head = &tokens[i + child.head]
if child > head and loop_count == 0: if loop_count == 0 and child > head:
head.r_kids += 1 head.r_kids += 1
if child.r_edge > head.r_edge: if child.r_edge > head.r_edge:
head.r_edge = child.r_edge head.r_edge = child.r_edge
@ -1380,14 +1375,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce
head.l_edge = child.l_edge head.l_edge = child.l_edge
# Get sentence start positions according to current state # Get sentence start positions according to current state
sent_starts = set() sent_starts = set()
for i in range(length): for i in range(start, end):
if tokens[i].head == 0 and tokens[i].dep != 0: if tokens[i].head == 0:
sent_starts.add(tokens[i].l_edge) sent_starts.add(tokens[i].l_edge)
cdef int curr_sent_start = 0 cdef int curr_sent_start = 0
cdef int curr_sent_end = 0 cdef int curr_sent_end = 0
# Check whether any heads are not within the current sentence # Check whether any heads are not within the current sentence
for i in range(length): for i in range(start, end):
if (i > 0 and i in sent_starts) or i == length - 1: if (i > 0 and i in sent_starts) or i == end - 1:
curr_sent_end = i curr_sent_end = i
for j in range(curr_sent_start, curr_sent_end): for j in range(curr_sent_start, curr_sent_end):
if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1: if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
@ -1436,6 +1431,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
with shape (n, n), where n = len(doc). with shape (n, n), where n = len(doc).
""" """
cdef int [:,:] lca_matrix cdef int [:,:] lca_matrix
cdef int j, k
n_tokens= end - start n_tokens= end - start
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
lca_mat.fill(-1) lca_mat.fill(-1)

View File

@ -4,13 +4,10 @@ cimport numpy as np
from libc.math cimport sqrt from libc.math cimport sqrt
import numpy import numpy
import numpy.linalg
from thinc.api import get_array_module from thinc.api import get_array_module
from collections import defaultdict
import warnings import warnings
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
from .token cimport TokenC
from ..structs cimport TokenC, LexemeC from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t, hash_t from ..typedefs cimport flags_t, attr_t, hash_t
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
@ -204,7 +201,7 @@ cdef class Span:
return Underscore(Underscore.span_extensions, self, return Underscore(Underscore.span_extensions, self,
start=self.start_char, end=self.end_char) start=self.start_char, end=self.end_char)
def as_doc(self, bint copy_user_data=False): def as_doc(self, *, bint copy_user_data=False):
"""Create a `Doc` object with a copy of the `Span`'s data. """Create a `Doc` object with a copy of the `Span`'s data.
copy_user_data (bool): Whether or not to copy the original doc's user data. copy_user_data (bool): Whether or not to copy the original doc's user data.
@ -212,19 +209,10 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#as_doc DOCS: https://nightly.spacy.io/api/span#as_doc
""" """
# TODO: make copy_user_data a keyword-only argument (Python 3 only)
words = [t.text for t in self] words = [t.text for t in self]
spaces = [bool(t.whitespace_) for t in self] spaces = [bool(t.whitespace_) for t in self]
cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID] array_head = self.doc._get_array_attrs()
if self.doc.is_tagged:
array_head.append(TAG)
# If doc parsed add head and dep attribute
if self.doc.is_parsed:
array_head.extend([HEAD, DEP])
# Otherwise add sent_start
else:
array_head.append(SENT_START)
array = self.doc.to_array(array_head) array = self.doc.to_array(array_head)
array = array[self.start : self.end] array = array[self.start : self.end]
self._fix_dep_copy(array_head, array) self._fix_dep_copy(array_head, array)
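With `as_doc` copying the full serialization attribute set and taking `copy_user_data` as a keyword-only argument, a call now looks like this (a minimal sketch; the `user_data` entry is made up for the example):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("First sentence. Second sentence.")
doc.user_data["note"] = "hypothetical user data"

span = list(doc.sents)[1]
span_doc = span.as_doc(copy_user_data=True)   # positional use is no longer accepted
assert span_doc.text == "Second sentence."
assert span_doc.user_data["note"] == "hypothetical user data"
```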
@ -378,7 +366,7 @@ cdef class Span:
self.doc.sents self.doc.sents
# Use `sent_start` token attribute to find sentence boundaries # Use `sent_start` token attribute to find sentence boundaries
cdef int n = 0 cdef int n = 0
if self.doc.is_sentenced: if self.doc.has_annotation("SENT_START"):
# Find start of the sentence # Find start of the sentence
start = self.start start = self.start
while self.doc.c[start].sent_start != 1 and start > 0: while self.doc.c[start].sent_start != 1 and start > 0:
@ -510,8 +498,6 @@ cdef class Span:
DOCS: https://nightly.spacy.io/api/span#noun_chunks DOCS: https://nightly.spacy.io/api/span#noun_chunks
""" """
if not self.doc.is_parsed:
raise ValueError(Errors.E029)
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us # prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration. The tricky thing here is that Span accepts

View File

@ -1,6 +1,4 @@
# cython: infer_types=True # cython: infer_types=True
from libc.string cimport memcpy
from cpython.mem cimport PyMem_Malloc, PyMem_Free
# Compiler crashes on memory view coercion without this. Should report bug. # Compiler crashes on memory view coercion without this. Should report bug.
from cython.view cimport array as cvarray from cython.view cimport array as cvarray
cimport numpy as np cimport numpy as np
@ -14,14 +12,13 @@ from ..typedefs cimport hash_t
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..symbols cimport conj from ..symbols cimport conj
from .morphanalysis cimport MorphAnalysis from .morphanalysis cimport MorphAnalysis
from .doc cimport set_children_from_heads
from .. import parts_of_speech from .. import parts_of_speech
from .. import util
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
@ -489,7 +486,7 @@ cdef class Token:
return True return True
def __set__(self, value): def __set__(self, value):
if self.doc.is_parsed: if self.doc.has_annotation("DEP"):
raise ValueError(Errors.E043) raise ValueError(Errors.E043)
if value is None: if value is None:
self.c.sent_start = 0 self.c.sent_start = 0
@ -658,78 +655,19 @@ cdef class Token:
# Do nothing if old head is new head # Do nothing if old head is new head
if self.i + self.c.head == new_head.i: if self.i + self.c.head == new_head.i:
return return
cdef Token old_head = self.head # Find the widest l/r_edges of the roots of the two tokens involved
cdef int rel_newhead_i = new_head.i - self.i # to limit the number of tokens for set_children_from_heads
# Is the new head a descendant of the old head cdef Token self_root, new_head_root
cdef bint is_desc = old_head.is_ancestor(new_head) self_ancestors = list(self.ancestors)
cdef int new_edge new_head_ancestors = list(new_head.ancestors)
cdef Token anc, child self_root = self_ancestors[-1] if self_ancestors else self
# Update number of deps of old head new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
if self.c.head > 0: # left dependent start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
old_head.c.l_kids -= 1 end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
if self.c.l_edge == old_head.c.l_edge:
# The token dominates the left edge so the left edge of
# the head may change when the token is reattached, it may
# not change if the new head is a descendant of the current
# head.
new_edge = self.c.l_edge
# The new l_edge is the left-most l_edge on any of the
# other dependents where the l_edge is left of the head,
# otherwise it is the head
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
if child == self:
continue
if child.c.l_edge < new_edge:
new_edge = child.c.l_edge
old_head.c.l_edge = new_edge
# Walk up the tree from old_head and assign new l_edge to
# ancestors until an ancestor already has an l_edge that's
# further left
for anc in old_head.ancestors:
if anc.c.l_edge <= new_edge:
break
anc.c.l_edge = new_edge
elif self.c.head < 0: # right dependent
old_head.c.r_kids -= 1
# Do the same thing as for l_edge
if self.c.r_edge == old_head.c.r_edge:
new_edge = self.c.r_edge
if not is_desc:
new_edge = old_head.i
for child in old_head.children:
if child == self:
continue
if child.c.r_edge > new_edge:
new_edge = child.c.r_edge
old_head.c.r_edge = new_edge
for anc in old_head.ancestors:
if anc.c.r_edge >= new_edge:
break
anc.c.r_edge = new_edge
# Update number of deps of new head
if rel_newhead_i > 0: # left dependent
new_head.c.l_kids += 1
# Walk up the tree from new head and set l_edge to self.l_edge
# until you hit a token with an l_edge further to the left
if self.c.l_edge < new_head.c.l_edge:
new_head.c.l_edge = self.c.l_edge
for anc in new_head.ancestors:
if anc.c.l_edge <= self.c.l_edge:
break
anc.c.l_edge = self.c.l_edge
elif rel_newhead_i < 0: # right dependent
new_head.c.r_kids += 1
# Do the same as for l_edge
if self.c.r_edge > new_head.c.r_edge:
new_head.c.r_edge = self.c.r_edge
for anc in new_head.ancestors:
if anc.c.r_edge >= self.c.r_edge:
break
anc.c.r_edge = self.c.r_edge
# Set new head # Set new head
self.c.head = rel_newhead_i self.c.head = new_head.i - self.i
# Adjust parse properties and sentence starts
set_children_from_heads(self.doc.c, start, end + 1)
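Instead of patching `l_kids`/`r_kids` and edges incrementally, re-assigning a head now recomputes the window spanned by the widest left/right edges of the two roots involved. A small before/after illustration (words and structure are arbitrary):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["very", "old", "books"])
# Initial chain: "very" -> "old" -> "books", with the noun as root.
doc[0].head = doc[1]
doc[1].head = doc[2]
assert [t.text for t in doc[2].subtree] == ["very", "old", "books"]

# Re-attach "very" directly to the root; children and edges are recomputed.
doc[0].head = doc[2]
assert [t.text for t in doc[1].subtree] == ["old"]
assert doc[2].left_edge.text == "very"
```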
@property @property
def conjuncts(self): def conjuncts(self):

View File

@ -212,8 +212,6 @@ def doc_from_conllu_sentence(
doc[i]._.merged_spaceafter = spaces[i] doc[i]._.merged_spaceafter = spaces[i]
ents = get_entities(lines, ner_tag_pattern, ner_map) ents = get_entities(lines, ner_tag_pattern, ner_map)
doc.ents = spans_from_biluo_tags(doc, ents) doc.ents = spans_from_biluo_tags(doc, ents)
doc.is_parsed = True
doc.is_tagged = True
if merge_subtokens: if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc) doc = merge_conllu_subtokens(lines, doc)
@ -243,8 +241,6 @@ def doc_from_conllu_sentence(
doc_x[i].dep_ = deps[i] doc_x[i].dep_ = deps[i]
doc_x[i].head = doc_x[heads[i]] doc_x[i].head = doc_x[heads[i]]
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
doc_x.is_parsed = True
doc_x.is_tagged = True
return doc_x return doc_x

View File

@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
json_para["links"].append(link_dict) json_para["links"].append(link_dict)
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []} json_sent = {"tokens": [], "brackets": []}
for token in sent: for token in sent:
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
if doc.is_tagged: if include_annotation["TAG"]:
json_token["tag"] = token.tag_ json_token["tag"] = token.tag_
if include_annotation["POS"]:
json_token["pos"] = token.pos_ json_token["pos"] = token.pos_
if include_annotation["MORPH"]:
json_token["morph"] = token.morph_ json_token["morph"] = token.morph_
if include_annotation["LEMMA"]:
json_token["lemma"] = token.lemma_ json_token["lemma"] = token.lemma_
if doc.is_parsed: if include_annotation["DEP"]:
json_token["head"] = token.head.i-token.i json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_ json_token["dep"] = token.dep_
json_token["ner"] = biluo_tags[token.i] if include_annotation["ENT_IOB"]:
json_token["ner"] = biluo_tags[token.i]
json_sent["tokens"].append(json_token) json_sent["tokens"].append(json_token)
json_para["sentences"].append(json_sent) json_para["sentences"].append(json_sent)
json_doc["paragraphs"].append(json_para) json_doc["paragraphs"].append(json_para)

View File

@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
| ----------- | -------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------- |
| **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
## Doc.has_annotation {#has_annotation tag="method"}
Check whether the doc contains annotation on a token attribute.
| Name | Description |
| ------------------ | --------------------------------------------------------------------------------------------------- |
| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ |
| _keyword-only_ | |
| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ |
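A short usage example alongside the table (illustrative; the blank pipeline plus `sentencizer` is just a convenient way to obtain `SENT_START` annotation):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("This is a sentence. This is another.")
assert doc.has_annotation("SENT_START")
assert not doc.has_annotation("DEP")   # no parser in this pipeline
```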
## Doc.to_array {#to_array tag="method"} ## Doc.to_array {#to_array tag="method"}
Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
@ -609,26 +620,22 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Description | | Name | Description |
| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | A string representation of the document text. ~~str~~ | | `text` | A string representation of the document text. ~~str~~ |
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | | `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | | `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
| `vocab` | The store of lexical types. ~~Vocab~~ | | `vocab` | The store of lexical types. ~~Vocab~~ |
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ | | `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | | `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ | | `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ | | `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ | | `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
| ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
@ -805,6 +806,25 @@ nlp = spacy.blank("en")
+ ruler.load_from_tag_map(YOUR_TAG_MAP) + ruler.load_from_tag_map(YOUR_TAG_MAP)
``` ```
### Migrating Doc flags {#migrating-doc-flags}
The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
`Doc.is_sentenced` are deprecated in v3.0 and replaced by the
[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
token attribute symbols (the same symbols used in `Matcher` patterns):
```diff
doc = nlp(text)
- doc.is_parsed
+ doc.has_annotation("DEP")
- doc.is_tagged
+ doc.has_annotation("TAG")
- doc.is_sentenced
+ doc.has_annotation("SENT_START")
- doc.is_nered
+ doc.has_annotation("ENT_IOB")
```
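As a hedged migration sketch (the helper name is hypothetical), a typical v2 guard that checked a flag before using parse-dependent properties now checks for the corresponding annotation instead:

```python
from spacy.tokens import Doc

def noun_phrase_texts(doc: Doc):
    # v2 guarded with `if not doc.is_parsed: return []`; in v3 the check asks
    # whether dependency annotation is present on the tokens.
    if not doc.has_annotation("DEP"):
        return []
    return [chunk.text for chunk in doc.noun_chunks]
```

Unlike the old flags, the new method can also require the annotation on every token by passing `require_complete=True`.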
### Training pipelines and models {#migrating-training} ### Training pipelines and models {#migrating-training}
To train your pipelines, you should now pretty much always use the To train your pipelines, you should now pretty much always use the