diff --git a/spacy/about.py b/spacy/about.py index 4ed3dd327..4fb6dfff1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a18" +__version__ = "3.0.0a19" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 0e80c3b5f..48229572b 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: RETURNS (dict): Generated dependency parse keyed by words and arcs. """ doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"])) - if not doc.is_parsed: + if not doc.has_annotation("DEP"): warnings.warn(Warnings.W005) if options.get("collapse_phrases", False): with doc.retokenize() as retokenizer: diff --git a/spacy/errors.py b/spacy/errors.py index 3bdeeccbe..173aedab9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,6 +119,11 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") + W106 = ("Both HEAD and SENT_START are included as attributes in " + "doc.from_array(). The parse trees based on the HEAD attribute " + "will override the values in SENT_START.") + W107 = ("The property Doc.{prop} is deprecated. Use " + "Doc.has_annotation(\"{attr}\") instead.") @add_codes @@ -192,11 +197,6 @@ class Errors: "Alternatively, add the dependency parser, or set sentence " "boundaries by setting doc[i].is_sent_start.") E031 = ("Invalid token: empty string ('') at position {i}.") - E032 = ("Conflicting attributes specified in doc.from_array(): " - "(HEAD, SENT_START). 
The HEAD attribute currently sets sentence " - "boundaries implicitly, based on the tree structure. This means " - "the HEAD attribute would potentially override the sentence " - "boundaries set by SENT_START.") E033 = ("Cannot load into non-empty Doc of length {length}.") E035 = ("Error creating span with start {start} and end {end} for Doc of " "length {length}.") @@ -397,8 +397,8 @@ class Errors: E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option validate=True with Matcher, " "PhraseMatcher, or EntityRuler for more details.") - E155 = ("The pipeline needs to include a tagger in order to use " - "Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. " + E155 = ("The pipeline needs to include a {pipe} in order to use " + "Matcher or PhraseMatcher with the attribute {attr}. " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "instead of list(nlp.tokenizer.pipe()).") E156 = ("The pipeline needs to include a parser in order to use " diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index 6c1d66cba..270185a4b 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,7 +1,11 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class BengaliDefaults(Language.Defaults): @@ -17,4 +21,22 @@ class Bengali(Language): Defaults = BengaliDefaults +@Bengali.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = 
Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Bengali"] diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index bd495f792..bd75a61eb 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_label = doc.vocab.strings.add("NP") np_deps = set(doc.vocab.strings.add(label) for label in labels) diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 0a13edcc0..89cfd8b72 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: # Further improvement of the models will eliminate the need for this tag. labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"] doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index 59ae733bd..2a1b0867e 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 427f1f203..ad0a1b838 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" doc = doclike.doc - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) if not len(doc): return diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 7fdb9d065..244534120 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_SUFFIXES from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class PersianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Persian(Language): Defaults = PersianDefaults +@Persian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Persian"] diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index b63db3539..0be06e73c 100644 --- 
a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -19,7 +19,7 @@ def noun_chunks(doclike): ] doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings.add(label) for label in labels] diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. - if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index f6d261643..0f29bfe16 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index d2bb92072..28a2f0bf2 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,9 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer class NorwegianDefaults(Language.Defaults): @@ -20,4 +24,22 @@ class Norwegian(Language): Defaults = NorwegianDefaults +@Norwegian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Norwegian"] diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index d297203e3..68117a54d 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 0c6a1b9f4..6db74cd39 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,8 +1,13 @@ +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from ...lookups import Lookups +from ...pipeline import Lemmatizer + # Punctuation stolen from Danish from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES @@ -22,4 +27,22 @@ class Swedish(Language): Defaults = SwedishDefaults +@Swedish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Swedish"] diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 662b508ed..d5ae47853 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]: labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] # fmt: on doc = doclike.doc # Ensure works on both Doc and Span. 
- if not doc.is_parsed: + if not doc.has_annotation("DEP"): raise ValueError(Errors.E029) np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings.add("conj") diff --git a/spacy/language.py b/spacy/language.py index 8f7cb1973..d530e6b92 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -8,7 +8,7 @@ from contextlib import contextmanager from copy import deepcopy from pathlib import Path import warnings -from thinc.api import get_current_ops, Config, require_gpu, Optimizer +from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle @@ -1448,10 +1448,15 @@ class Language: """Register 'listeners' within pipeline components, to allow them to effectively share weights. """ + # I had thought, "Why do we do this inside the Language object? Shouldn't + # it be the tok2vec/transformer/etc's job?" + # The problem is we need to do it during deserialization...And the + # components don't receive the pipeline then. 
So this does have to be + # here :( for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): - for name2, proc2 in self.pipeline[i:]: - if hasattr(proc2, "model"): + for name2, proc2 in self.pipeline[i+1:]: + if isinstance(getattr(proc2, "model", None), Model): proc1.find_listeners(proc2.model) @classmethod diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 079cac788..d83f58181 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,7 +17,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -215,10 +215,15 @@ cdef class Matcher: else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef Pool tmp_pool = Pool() - if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ - and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if DEP in self._seen_attrs and not doc.is_parsed: + if TAG in self._seen_attrs and not doc.has_annotation("TAG"): + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if POS in self._seen_attrs and not doc.has_annotation("POS"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if DEP in self._seen_attrs and not doc.has_annotation("DEP"): raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, 
length, extensions=self._extensions, predicates=self._extra_predicates) diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index fae513367..b00ba157f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter import warnings -from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA +from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH from ..structs cimport TokenC from ..tokens.token cimport Token from ..tokens.span cimport Span @@ -184,12 +184,20 @@ cdef class PhraseMatcher: if len(doc) == 0: continue if isinstance(doc, Doc): - if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged: - raise ValueError(Errors.E155.format()) - if self.attr == DEP and not doc.is_parsed: + attrs = (TAG, POS, MORPH, LEMMA, DEP) + has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} + if self.attr == TAG and not has_annotation[TAG]: + raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) + if self.attr == POS and not has_annotation[POS]: + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) + if self.attr == MORPH and not has_annotation[MORPH]: + raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) + if self.attr == LEMMA and not has_annotation[LEMMA]: + raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) + if self.attr == DEP and not has_annotation[DEP]: raise ValueError(Errors.E156.format()) - if self._validate and (doc.is_tagged or doc.is_parsed) \ - and self.attr not in (DEP, POS, TAG, LEMMA): + if self._validate and any(has_annotation.values()) \ + and self.attr not in attrs: string_attr = self.vocab.strings[self.attr] warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) keyword = self._convert_to_array(doc) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 2e5f8a802..7ced4bd04 100644 --- a/spacy/ml/models/tok2vec.py +++ 
b/spacy/ml/models/tok2vec.py @@ -164,7 +164,7 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(width: int, rows: int, nM: int, nC: int): +def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool): """Construct an embedded representation based on character embeddings, using a feed-forward network. A fixed number of UTF-8 byte characters are used for each word, taken from the beginning and end of the word equally. Padding is @@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int): nC (int): The number of UTF-8 bytes to embed per word. Recommended values are between 3 and 8, although it may depend on the length of words in the language. + also_use_static_vectors (bool): Whether to also use static word vectors. + Requires a vectors table to be loaded in the Doc objects' vocab. """ - model = chain( - concatenate( - chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), - chain( - FeatureExtractor([NORM]), - list2ragged(), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + if also_use_static_vectors: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + StaticVectors(width, dropout=0.0), ), - ), - with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), - ragged2list(), + with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)), + ragged2list(), + ) + else: + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)), + ), + ), + with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + ragged2list(), ) return model diff --git 
a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index bb0bf35b8..dafa99bdd 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -679,8 +679,7 @@ cdef class ArcEager(TransitionSystem): st._sent[i].dep = self.root_label def finalize_doc(self, Doc doc): - doc.is_parsed = True - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def has_gold(self, Example eg, start=0, end=None): for word in eg.y[start:end]: diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 8f5fdaa71..82070cd27 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc): new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i doc.c[i].dep = doc.vocab.strings.add(new_label) - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) return doc diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 7e68ea369..614608b25 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc: DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks """ - if not doc.is_parsed: + if not doc.has_annotation("DEP"): return doc with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 57bdb28d7..62ad9e0eb 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,6 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 +also_use_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" @@ -203,8 +204,6 @@ class Morphologizer(Tagger): doc.c[j].morph = 
self.vocab.morphology.add(self.cfg["labels_morph"][morph]) doc.c[j].pos = self.cfg["labels_pos"][morph] - doc.is_morphed = True - def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and their predicted scores. @@ -259,79 +258,3 @@ class Morphologizer(Tagger): results.update(Scorer.score_token_attr_per_feat(examples, "morph", **kwargs)) return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - - DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The loaded Morphologizer. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. 
- - DOCS: https://nightly.spacy.io/api/morphologizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Morphologizer): The modified Morphologizer object. - - DOCS: https://nightly.spacy.io/api/morphologizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 00664131b..a7eb721fd 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger): results = Scorer.score_spans(examples, "sents", **kwargs) del results["sents_per_type"] return results - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes - """ - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - bytes_data (bytes): The serialized pipe. 
- exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The loaded SentenceRecognizer. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes - """ - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, *, exclude=tuple()): - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk - """ - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, *, exclude=tuple()): - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (Tagger): The modified SentenceRecognizer object. 
- - DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk - """ - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 1f8b4eb7a..0d78047ae 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -168,7 +168,6 @@ class Tagger(Pipe): # Don't clobber preset POS tags if doc.c[j].tag == 0: doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - doc.is_tagged = True def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): """Learn from a batch of documents and gold-standard information, diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index b37a31e43..ce979d3d1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) tokens[0].lemma_ = "lemma" tokens[0].norm_ = "norm" + tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)] tokens[0].ent_kb_id_ = "ent_kb_id" new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text @@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer): def test_doc_api_sents_empty_string(en_tokenizer): doc = en_tokenizer("") - doc.is_parsed = True sents = list(doc.sents) assert len(sents) == 0 @@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer): text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." 
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + deps = ["dep"] * len(heads) # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] # fmt: off @@ -240,7 +241,9 @@ def test_doc_api_similarity_match(): ) def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): tokens = en_tokenizer(sentence) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + doc = get_doc( + tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads) + ) lca = doc.get_lca_matrix() assert (lca == lca_matrix).all() assert lca[1, 1] == 1 @@ -251,51 +254,55 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix): def test_doc_is_nered(en_vocab): words = ["I", "live", "in", "New", "York"] doc = Doc(en_vocab, words=words) - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") doc.ents = [Span(doc, 3, 5, label="GPE")] - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test creating doc from array with unknown values arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64") doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr) - assert doc.is_nered + assert doc.has_annotation("ENT_IOB") # Test serialization new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_nered + assert new_doc.has_annotation("ENT_IOB") def test_doc_from_array_sent_starts(en_vocab): words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."] - heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6] + heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3] # fmt: off - deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"] + deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", 
"ROOT", "dep", "dep", "dep"] # fmt: on - doc = Doc(en_vocab, words=words) - for i, (dep, head) in enumerate(zip(deps, heads)): - doc[i].dep_ = dep - doc[i].head = doc[head] - if head == i: - doc[i].is_sent_start = True - doc.is_parsed + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + # HEAD overrides SENT_START with warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.raises(ValueError): + with pytest.warns(UserWarning): new_doc.from_array(attrs, arr) - attrs = [SENT_START, DEP] + # no warning using default attrs + attrs = doc._get_array_attrs() + arr = doc.to_array(attrs) + with pytest.warns(None) as record: + new_doc.from_array(attrs, arr) + assert len(record) == 0 + + # only SENT_START uses SENT_START + attrs = [SENT_START] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert not new_doc.is_parsed + assert not new_doc.has_annotation("DEP") + # only HEAD uses HEAD attrs = [HEAD, DEP] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) new_doc.from_array(attrs, arr) assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc] - assert new_doc.is_parsed + assert new_doc.has_annotation("DEP") def test_doc_from_array_morph(en_vocab): @@ -365,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) - with pytest.raises(ValueError): - # important attributes from sentenziser or parser are missing - assert list(m_doc.sents) assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) # space delimiter considered, although spacy attribute was missing assert str(m_doc) == " ".join(en_texts_without_empty) @@ -379,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): assert m_doc[9].idx == think_idx +def 
test_doc_api_from_docs_ents(en_tokenizer): + texts = ["Merging the docs is fun.", "They don't think alike."] + docs = [en_tokenizer(t) for t in texts] + docs[0].ents = () + docs[1].ents = (Span(docs[1], 0, 1, label="foo"),) + doc = Doc.from_docs(docs) + assert len(doc.ents) == 1 + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" @@ -399,3 +412,45 @@ def test_token_lexeme(en_vocab): assert isinstance(token.lex, Lexeme) assert token.lex.text == token.text assert en_vocab[token.orth] == token.lex + + +def test_has_annotation(en_vocab): + doc = Doc(en_vocab, words=["Hello", "world"]) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") + for attr in attrs: + assert not doc.has_annotation(attr) + + doc[0].tag_ = "A" + doc[0].pos_ = "X" + doc[0].morph_ = "Feat=Val" + doc[0].lemma_ = "a" + doc[0].dep_ = "dep" + doc[0].head = doc[1] + doc.ents = [Span(doc, 0, 1, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[1].tag_ = "A" + doc[1].pos_ = "X" + doc[1].morph_ = "" + doc[1].lemma_ = "a" + doc[1].dep_ = "dep" + doc.ents = [Span(doc, 0, 2, label="HELLO")] + + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + +def test_is_flags_deprecated(en_tokenizer): + doc = en_tokenizer("test") + with pytest.deprecated_call(): + doc.is_tagged + with pytest.deprecated_call(): + doc.is_parsed + with pytest.deprecated_call(): + doc.is_nered + with pytest.deprecated_call(): + doc.is_sentenced diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 1e9623484..ad4f49042 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer): text = "This is a sentence. This is another sentence. And a third." 
tokens = en_tokenizer(text) doc = Doc(tokens.vocab, words=[t.text for t in tokens]) - doc.is_parsed = False return doc @@ -71,8 +70,9 @@ def test_spans_string_fn(doc): def test_spans_root2(en_tokenizer): text = "through North and South Carolina" heads = [0, 3, -1, -2, -4] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[-2:].root.text == "Carolina" @@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" tokens = en_tokenizer("the lazy dog slept") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0]) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4) lca = doc[:2].get_lca_matrix() assert lca.shape == (2, 2) assert lca[0, 0] == 0 # the & the -> the diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index be56c9b71..1308df67b 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -112,11 +112,11 @@ def test_doc_token_api_ancestors(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer): - # the structure of this sentence depends on the English annotation scheme text = "Yesterday I saw a dog that barked loudly." heads = [2, 1, 0, 1, -2, 1, -2, -1, -6] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert doc[6].n_lefts == 1 assert doc[6].n_rights == 1 @@ -169,13 +169,46 @@ def test_doc_token_api_head_setter(en_tokenizer): with pytest.raises(ValueError): doc[0].head = doc2[0] + # test sentence starts when two sentences are joined + text = "This is one sentence. 
This is another sentence." + heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4] + tokens = en_tokenizer(text) + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + deps=["dep"] * len(heads), + ) + # initially two sentences + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # modifying with a sentence doesn't change sent starts + doc[2].head = doc[3] + assert doc[0].is_sent_start + assert doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[4] + assert doc[5].left_edge == doc[5] + assert doc[5].right_edge == doc[9] + + # attach the second sentence to the first, resulting in one sentence + doc[5].head = doc[0] + assert doc[0].is_sent_start + assert not doc[5].is_sent_start + assert doc[0].left_edge == doc[0] + assert doc[0].right_edge == doc[9] + def test_is_sent_start(en_tokenizer): doc = en_tokenizer("This is a sentence. 
This is another.") assert doc[5].is_sent_start is None doc[5].is_sent_start = True assert doc[5].is_sent_start is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -184,7 +217,6 @@ def test_is_sent_end(en_tokenizer): assert doc[4].is_sent_end is None doc[5].is_sent_start = True assert doc[4].is_sent_end is True - doc.is_parsed = True assert len(list(doc.sents)) == 2 @@ -209,14 +241,14 @@ def test_token0_has_sent_start_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_start is True assert doc[1].is_sent_start is None - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_tokenlast_has_sent_end_true(): doc = Doc(Vocab(), words=["hello", "world"]) assert doc[0].is_sent_end is None assert doc[1].is_sent_end is True - assert not doc.is_sentenced + assert not doc.has_annotation("SENT_START") def test_token_api_conjuncts_chain(en_vocab): diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py index ff9f8d5e5..0ed12d208 100644 --- a/spacy/tests/lang/de/test_noun_chunks.py +++ b/spacy/tests/lang/de/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_de(de_tokenizer): """Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = de_tokenizer("Er lag auf seinem") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py index 38e72b0b2..2d376c612 100644 --- a/spacy/tests/lang/el/test_noun_chunks.py +++ b/spacy/tests/lang/el/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_el(el_tokenizer): """Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed. 
- To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = el_tokenizer("είναι χώρα της νοτιοανατολικής") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 5395dbabe..fa3a134bd 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -11,12 +11,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed(en_tokenizer): """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = en_tokenizer("This is a sentence") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 38c8d94d8..ee1e6be17 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_tokenizer, text, punct): heads = [2, 1, 0, -1] if punct else [2, 1, 0] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(doc) == 4 if punct else 3 assert len(list(doc.sents)) == 1 assert sum(len(sent) for sent in doc.sents) == len(doc) diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py index a7ec4e562..db89fd903 100644 --- a/spacy/tests/lang/es/test_noun_chunks.py +++ b/spacy/tests/lang/es/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def 
test_noun_chunks_is_parsed_es(es_tokenizer): """Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = es_tokenizer("en Oxford este verano") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py index 767e91f6b..53b39d9a1 100644 --- a/spacy/tests/lang/fa/test_noun_chunks.py +++ b/spacy/tests/lang/fa/test_noun_chunks.py @@ -3,12 +3,8 @@ import pytest def test_noun_chunks_is_parsed_fa(fa_tokenizer): """Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = fa_tokenizer("این یک جمله نمونه می باشد.") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py index 5fd6897f7..d81199a3e 100644 --- a/spacy/tests/lang/fr/test_noun_chunks.py +++ b/spacy/tests/lang/fr/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_fr(fr_tokenizer): """Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = fr_tokenizer("trouver des travaux antérieurs") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py index 445643933..fef1524f1 100644 --- a/spacy/tests/lang/id/test_noun_chunks.py +++ b/spacy/tests/lang/id/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_id(id_tokenizer): """Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = id_tokenizer("sebelas") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py index c6a00354b..9965fcd14 100644 --- a/spacy/tests/lang/nb/test_noun_chunks.py +++ b/spacy/tests/lang/nb/test_noun_chunks.py @@ -3,11 +3,7 @@ import pytest def test_noun_chunks_is_parsed_nb(nb_tokenizer): """Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. """ doc = nb_tokenizer("Smørsausen brukes bl.a. til") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index f352ca648..458cdadd5 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -5,12 +5,8 @@ from ...util import get_doc def test_noun_chunks_is_parsed_sv(sv_tokenizer): """Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed. - To check this test, we're constructing a Doc - with a new Vocab here and forcing is_parsed to 'False' - to make sure the noun chunks don't run. 
""" doc = sv_tokenizer("Studenten läste den bästa boken") - doc.is_parsed = False with pytest.raises(ValueError): list(doc.noun_chunks) diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index 14c59659a..6e7f82341 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: pl -LANGUAGES = ["el", "en", "fr", "nl"] +LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e0f335a19..04f9585f1 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = Matcher(en_vocab) matcher.add("TEST", [[{"DEP": "a"}]]) matcher(doc1) @@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) matcher.add("TEST", [[{attr: "a"}]]) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 4b7027f87..9caf284a3 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab): def test_phrase_matcher_validation(en_vocab): 
doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" doc3 = Doc(en_vocab, words=["Test"]) matcher = PhraseMatcher(en_vocab, validate=True) with pytest.warns(UserWarning): @@ -212,18 +214,21 @@ def test_attr_validation(en_vocab): def test_attr_pipeline_checks(en_vocab): doc1 = Doc(en_vocab, words=["Test"]) - doc1.is_parsed = True + doc1[0].dep_ = "ROOT" doc2 = Doc(en_vocab, words=["Test"]) - doc2.is_tagged = True + doc2[0].tag_ = "TAG" + doc2[0].pos_ = "X" + doc2[0].morph_ = "Feat=Val" + doc2[0].lemma_ = "LEMMA" doc3 = Doc(en_vocab, words=["Test"]) - # DEP requires is_parsed + # DEP requires DEP matcher = PhraseMatcher(en_vocab, attr="DEP") matcher.add("TEST1", [doc1]) with pytest.raises(ValueError): matcher.add("TEST2", [doc2]) with pytest.raises(ValueError): matcher.add("TEST3", [doc3]) - # TAG, POS, LEMMA require is_tagged + # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = PhraseMatcher(en_vocab, attr=attr) matcher.add("TEST2", [doc2]) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8d45e2132..9e760c1e7 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser): def test_parser_parse_subtrees(en_tokenizer, en_parser): text = "The four wheels on the bus turned quickly" heads = [2, 1, 4, -1, 1, -2, 0, -1] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(list(doc[2].lefts)) == 2 assert len(list(doc[2].rights)) == 1 assert len(list(doc[2].children)) == 3 @@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab): if i == 0 or i == 3: assert 
doc[i].is_sent_start is True else: - assert doc[i].is_sent_start is None + assert doc[i].is_sent_start is False for sent in doc.sents: for token in sent: assert token.head in sent diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index f42601a85..db1e98ba0 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)) lefts = {} rights = {} diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 3a0a6b943..3672dabea 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence def test_parser_space_attachment(en_tokenizer): text = "This is a test.\nTo ensure spaces are attached well." 
heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2] + deps = ["dep"] * len(heads) tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) + doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) for sent in doc.sents: if len(sent) == 1: assert not sent[-1].is_space diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index 9254688cc..a66b34bc0 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_init_patterns(nlp, pattern_dicts): @@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") # initialize with patterns from asset nlp.add_pipe( @@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc[2].morph_ == "Case=Nom|Number=Plur" assert doc[3].lemma_ == "cat" assert doc[3].morph_ == "Case=Nom|Number=Sing" + assert doc.has_annotation("LEMMA") + assert doc.has_annotation("MORPH") def test_attributeruler_score(nlp, pattern_dicts): diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 0ec8a5ec2..ee9e34df3 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -35,8 +35,6 @@ def doc2(en_tokenizer): deps=deps, ) doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] - doc.is_parsed = True - doc.is_tagged = True return doc diff --git 
a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 1b1c51f34..5dd0fef43 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -9,7 +9,7 @@ def test_sentencizer(en_vocab): doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) sentencizer = Sentencizer(punct_chars=None) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] sent_ends = [t.is_sent_end for t in doc] assert sent_starts == [True, False, True, False, False, False, False] @@ -22,13 +22,13 @@ def test_sentencizer_pipe(): nlp = English() nlp.add_pipe("sentencizer") for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 for ex in nlp.pipe(texts): doc = ex.doc - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] assert sent_starts == [True, False, True, False, False, False, False] assert len(list(doc.sents)) == 2 @@ -42,7 +42,7 @@ def test_sentencizer_empty_docs(): nlp.add_pipe("sentencizer") for texts in [one_empty_text, many_empty_texts, some_empty_texts]: for doc in nlp.pipe(texts): - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") sent_starts = [t.is_sent_start for t in doc] if len(doc) == 0: assert sent_starts == [] @@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents): doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=None) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents @@ -115,7 +115,7 @@ def 
test_sentencizer_custom_punct( doc = Doc(en_vocab, words=words) sentencizer = Sentencizer(punct_chars=punct_chars) doc = sentencizer(doc) - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert [t.is_sent_start for t in doc] == sent_starts assert [t.is_sent_end for t in doc] == sent_ends assert len(list(doc.sents)) == n_sents diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 0365554bc..2e514f490 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index ed5bcc1a5..30f66fb1d 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -94,7 +94,6 @@ def test_issue309(en_tokenizer): doc = get_doc( tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] ) - doc.is_parsed = True assert len(doc) == 1 sents = list(doc.sents) assert len(sents) == 1 @@ -170,11 +169,9 
@@ def test_issue595(): def test_issue599(en_vocab): doc = Doc(en_vocab) - doc.is_tagged = True - doc.is_parsed = True doc2 = Doc(doc.vocab) doc2.from_bytes(doc.to_bytes()) - assert doc2.is_parsed + assert doc2.has_annotation("DEP") def test_issue600(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index c1d726db6..e226c8524 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher -from ..util import make_tempdir +from ..util import make_tempdir, get_doc def test_issue1506(): @@ -198,17 +198,26 @@ def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . And another one" - doc = Doc(Vocab(), words=string.split()) - doc[6].sent_start = True + words = string.split() + doc = get_doc(Vocab(), words=words) + doc[6].is_sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start - assert not new_doc.is_parsed - assert not new_doc.is_tagged - doc.is_parsed = True - doc.is_tagged = True + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = get_doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, -1, -2, -3, -4, -5, 0, -1, -2], + deps=["dep"] * len(words), + ) + print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc]) new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc.is_parsed - assert new_doc.is_tagged + print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc]) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") def test_issue1868(): diff --git a/spacy/tests/regression/test_issue2001-2500.py 
b/spacy/tests/regression/test_issue2001-2500.py index 357fbb84e..3bea5d3f6 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -72,8 +72,6 @@ def test_issue2219(en_vocab): def test_issue2361(de_tokenizer): chars = ("<", ">", "&", """) doc = de_tokenizer('< > & " ') - doc.is_parsed = True - doc.is_tagged = True html = render(doc) for char in chars: assert char in html @@ -108,6 +106,7 @@ def test_issue2385_biluo(tags): def test_issue2396(en_vocab): words = ["She", "created", "a", "test", "for", "spacy"] heads = [1, 0, 1, -2, -1, -1] + deps = ["dep"] * len(heads) matrix = numpy.array( [ [0, 1, 1, 1, 1, 1], @@ -119,7 +118,7 @@ def test_issue2396(en_vocab): ], dtype=numpy.int32, ) - doc = get_doc(en_vocab, words=words, heads=heads) + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) span = doc[:] assert (doc.get_lca_matrix() == matrix).all() assert (span.get_lca_matrix() == matrix).all() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index beb8faca1..9267a7346 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -16,16 +16,16 @@ from ..util import get_doc def test_issue2564(): - """Test the tagger sets is_tagged correctly when used via Language.pipe.""" + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") nlp.begin_training() doc = nlp("hello world") - assert doc.is_tagged + assert doc.has_annotation("TAG") docs = nlp.pipe(["hello", "world"]) piped_doc = next(docs) - assert piped_doc.is_tagged + assert piped_doc.has_annotation("TAG") def test_issue2569(en_tokenizer): @@ -123,7 +123,7 @@ def test_issue2772(en_vocab): heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4] deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - assert 
doc[1].is_sent_start is None + assert doc[1].is_sent_start is False @pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 3059eb5ab..d848467dd 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -63,7 +63,7 @@ def test_issue3012(en_vocab): pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] ents = [(2, 4, "PERCENT")] doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.is_tagged + assert doc.has_annotation("TAG") expected = ("10", "NUM", "CD", "PERCENT") assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected @@ -83,10 +83,14 @@ def test_issue3012(en_vocab): def test_issue3199(): """Test that Span.noun_chunks works correctly if no noun chunks iterator is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and setting is_parsed to make sure the noun chunks run. + with a new Vocab here and a parse tree to make sure the noun chunks run. 
""" - doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) - doc.is_parsed = True + doc = get_doc( + Vocab(), + words=["This", "is", "a", "sentence"], + heads=[0, -1, -2, -3], + deps=["dep"] * 4, + ) assert list(doc[0:3].noun_chunks) == [] @@ -250,16 +254,16 @@ def test_issue3456(): def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.is_sentenced can + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can be restored after serialization.""" nlp = English() nlp.add_pipe("sentencizer") doc = nlp("Hello world") assert doc[0].is_sent_start - assert doc.is_sentenced + assert doc.has_annotation("SENT_START") assert len(list(doc.sents)) == 1 doc_bytes = doc.to_bytes() new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) assert new_doc[0].is_sent_start - assert new_doc.is_sentenced + assert new_doc.has_annotation("SENT_START") assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index d36e693c7..8c483d877 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -356,7 +356,6 @@ def test_issue3882(en_vocab): copy of the Doc. 
""" doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True doc.user_data["test"] = set() parse_deps(doc) @@ -386,7 +385,6 @@ def test_issue3959(): doc[0].pos_ = "NOUN" assert doc[0].pos_ == "NOUN" # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True with make_tempdir() as tmp_dir: file_path = tmp_dir / "my_doc" doc.to_disk(file_path) diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 2beccedcf..4e58c347e 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -189,7 +189,6 @@ def test_issue4133(en_vocab): for i, token in enumerate(doc): token.pos_ = pos[i] # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True doc_bytes = doc.to_bytes() vocab = Vocab() vocab = vocab.from_bytes(vocab_bytes) @@ -249,7 +248,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we have correct IOB annotations doc1 = nlp("hi") - assert doc1.is_nered + assert doc1.has_annotation("ENT_IOB") for token in doc1: assert token.ent_iob == 2 # add entity ruler and run again @@ -260,7 +259,7 @@ def test_issue4267(): assert "ner" in nlp.pipe_names # assert that we still have correct IOB annotations doc2 = nlp("hi") - assert doc2.is_nered + assert doc2.has_annotation("ENT_IOB") for token in doc2: assert token.ent_iob == 2 diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index fb96c0361..6e3604ce8 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -80,7 +80,6 @@ def tagged_doc(): doc[i].morph_ = morphs[i] if i > 0: doc[i].is_sent_start = False - doc.is_tagged = True return doc diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 67cc37b1c..1d3c72a8b 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ 
-12,7 +12,7 @@ from thinc.api import compounding import pytest import srsly -from ..util import make_tempdir +from ..util import make_tempdir, get_doc @pytest.fixture @@ -26,24 +26,16 @@ def doc(): "NounType=prop|Number=sing", "PunctType=peri"] # head of '.' is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] + heads = [head - i for i, head in enumerate(heads)] deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."] - biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE")) cats = {"TRAVEL": 1.0, "BAKING": 0.0} # fmt: on nlp = English() - doc = nlp(text) - for i in range(len(tags)): - doc[i].tag_ = tags[i] - doc[i].pos_ = pos[i] - doc[i].morph_ = morphs[i] - doc[i].lemma_ = lemmas[i] - doc[i].dep_ = deps[i] - doc[i].head = doc[heads[i]] - doc.ents = spans_from_biluo_tags(doc, biluo_tags) + words = [t.text for t in nlp.make_doc(text)] + doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents) doc.cats = cats - doc.is_tagged = True - doc.is_parsed = True return doc @@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab): docs = json2docs(data) assert len(docs) == 1 for doc in docs: - assert not doc.is_nered + assert not doc.has_annotation("ENT_IOB") for token in doc: assert token.ent_iob == 0 eg = Example( diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 9323bb579..cd1e73a2b 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -274,7 +274,7 @@ def _merge(Doc doc, merges): for i in range(doc.length): doc.c[i].head -= i # Set the left/right children, left/right edges - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) # Make sure ent_iob remains consistent 
make_iob_consistent(doc.c, doc.length) # Return the merged Python object @@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): for i in range(doc.length): doc.c[i].head -= i # set children from head - set_children_from_heads(doc.c, doc.length) + set_children_from_heads(doc.c, 0, doc.length) def _validate_extensions(extensions): @@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length): def normalize_token_attrs(Vocab vocab, attrs): if "_" in attrs: # Extension attributes extensions = attrs["_"] - print("EXTENSIONS", extensions) _validate_extensions(extensions) attrs = {key: value for key, value in attrs.items() if key != "_"} attrs = intify_attrs(attrs, strings_map=vocab.strings) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index cd8c81939..c9a20f6c0 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -13,7 +13,7 @@ from ..errors import Errors from ..util import ensure_path, SimpleFrozenList # fmt: off -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") +ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") # fmt: on diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 2775aa97e..08f795b1a 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 +cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1 +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2 @@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except cdef int 
token_by_end(const TokenC* tokens, int length, int end_char) except -2 -cdef int set_children_from_heads(TokenC* tokens, int length) except -1 - - cdef int [:,:] _get_lca_matrix(Doc, int start, int end) cdef class Doc: @@ -49,10 +46,6 @@ cdef class Doc: cdef TokenC* c - cdef public bint is_tagged - cdef public bint is_parsed - cdef public bint is_morphed - cdef public float sentiment cdef public dict user_hooks @@ -74,5 +67,3 @@ cdef class Doc: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 cpdef np.ndarray to_array(self, object features) - - cdef void set_parse(self, const TokenC* parsed) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 93520aeda..5c5443258 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,37 +1,34 @@ # cython: infer_types=True, bounds_check=False, profile=True cimport cython cimport numpy as np -from libc.string cimport memcpy, memset +from libc.string cimport memcpy from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter import numpy -import numpy.linalg -import struct import srsly from thinc.api import get_array_module from thinc.util import copy_array import warnings -import copy from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER +from ..attrs cimport attr_id_t from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t -from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t +from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM -from ..attrs import intify_attr, intify_attrs, IDS -from ..util import normalize_slice +from ..attrs import intify_attr, IDS from ..compat import copy_reg, pickle from ..errors import Errors, Warnings +from ..morphology 
import Morphology from .. import util from .underscore import Underscore, get_ext_args from ._retokenize import Retokenizer +from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS DEF PADDING = 5 @@ -190,8 +187,6 @@ cdef class Doc: self.c = data_start + PADDING self.max_length = size self.length = 0 - self.is_tagged = False - self.is_parsed = False self.sentiment = 0.0 self.cats = {} self.user_hooks = {} @@ -221,11 +216,6 @@ cdef class Doc: else: lexeme = self.vocab.get_by_orth(self.mem, word) self.push_back(lexeme, has_space) - # Tough to decide on policy for this. Is an empty doc tagged and parsed? - # There's no information we'd like to add to it, so I guess so? - if self.length == 0: - self.is_tagged = True - self.is_parsed = True @property def _(self): @@ -233,37 +223,61 @@ cdef class Doc: return Underscore(Underscore.doc_extensions, self) @property - def is_sentenced(self): - """Check if the document has sentence boundaries assigned. This is - defined as having at least one of the following: + def is_tagged(self): + warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning) + return self.has_annotation("TAG") - a) An entry "sents" in doc.user_hooks"; - b) Doc.is_parsed is set to True; - c) At least one token other than the first where sent_start is not None. - """ - if "sents" in self.user_hooks: - return True - if self.is_parsed: - return True - if len(self) < 2: - return True - for i in range(1, self.length): - if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: - return True - return False + @property + def is_parsed(self): + warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning) + return self.has_annotation("DEP") @property def is_nered(self): - """Check if the document has named entities set. Will return True if - *any* of the tokens has a named entity tag set (even if the others are - unknown values), or if the document is empty. 
+ warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning) + return self.has_annotation("ENT_IOB") + + @property + def is_sentenced(self): + warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning) + return self.has_annotation("SENT_START") + + def has_annotation(self, attr, *, require_complete=False): + """Check whether the doc contains annotation on a token attribute. + + attr (Union[int, str]): The attribute string name or int ID. + require_complete (bool): Whether to check that the attribute is set on + every token in the doc. + RETURNS (bool): Whether annotation is present. + + DOCS: https://nightly.spacy.io/api/doc#has_annotation """ - if len(self) == 0: + + # empty docs are always annotated + if self.length == 0: return True - for i in range(self.length): - if self.c[i].ent_iob != 0: + cdef int i + cdef int range_start = 0 + attr = intify_attr(attr) + # adjust attributes + if attr == HEAD: + # HEAD does not have an unset state, so rely on DEP + attr = DEP + elif attr == self.vocab.strings["IS_SENT_START"]: + # as in Matcher, allow IS_SENT_START as an alias of SENT_START + attr = SENT_START + # special cases for sentence boundaries + if attr == SENT_START: + if "sents" in self.user_hooks: return True - return False + # docs of length 1 always have sentence boundaries + if self.length == 1: + return True + range_start = 1 + if require_complete: + return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) + else: + return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length)) def __getitem__(self, object i): """Get a `Token` or `Span` object. 
@@ -291,7 +305,7 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#getitem """ if isinstance(i, slice): - start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step) return Span(self, start, stop, label=0) if i < 0: i = self.length + i @@ -627,16 +641,13 @@ cdef class Doc: @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` - objects. Sentence spans have no label. To improve accuracy on informal - texts, spaCy calculates sentence boundaries from the syntactic - dependency parse. If the parser is disabled, the `sents` iterator will - be unavailable. + objects. Sentence spans have no label. YIELDS (Span): Sentences in the document. DOCS: https://nightly.spacy.io/api/doc#sents """ - if not self.is_sentenced: + if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: yield from self.user_hooks["sents"](self) @@ -660,10 +671,6 @@ cdef class Doc: return self.vocab.lang cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: - if self.length == 0: - # Flip these to false when we see the first token. - self.is_tagged = False - self.is_parsed = False if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.c[self.length] @@ -786,14 +793,6 @@ cdef class Doc: for i in range(self.length, self.max_length + PADDING): self.c[i].lex = &EMPTY_LEXEME - cdef void set_parse(self, const TokenC* parsed) nogil: - # TODO: This method is fairly misleading atm. It's used by Parser - # to actually apply the parse calculated. Need to rethink this. - # Probably we should use from_array? - self.is_parsed = True - for i in range(self.length): - self.c[i] = parsed[i] - def from_array(self, attrs, array): """Load attributes from a numpy array. Write to a `Doc` object, from an `(M, N)` array of attributes. 
@@ -818,8 +817,8 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if SENT_START in attrs and HEAD in attrs: - raise ValueError(Errors.E032) + if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: + warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id @@ -879,18 +878,17 @@ cdef class Doc: # add morph to morphology table self.vocab.morphology.add(self.vocab.strings[value]) Token.set_struct_attr(token, attr_ids[j], value) - # Set flags - self.is_parsed = bool(self.is_parsed or HEAD in attrs) - self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) - # If document is parsed, set children - if self.is_parsed: - set_children_from_heads(self.c, length) + # If document is parsed, set children and sentence boundaries + if HEAD in attrs and DEP in attrs: + col = attrs.index(DEP) + if array[:, col].any(): + set_children_from_heads(self.c, 0, length) return self @staticmethod def from_docs(docs, ensure_whitespace=True, attrs=None): - """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share - the same `Vocab`. + """Concatenate multiple Doc objects to form a new one. Raises an error + if the `Doc` objects do not all share the same `Vocab`. docs (list): A list of Doc objects. ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. @@ -908,16 +906,7 @@ cdef class Doc: (vocab,) = vocab if attrs is None: - attrs = [LEMMA, NORM] - if all(doc.is_nered for doc in docs): - attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE]) - # TODO: separate for is_morphed? 
- if all(doc.is_tagged for doc in docs): - attrs.extend([TAG, POS, MORPH]) - if all(doc.is_parsed for doc in docs): - attrs.extend([HEAD, DEP]) - else: - attrs.append(SENT_START) + attrs = Doc._get_array_attrs() else: if any(isinstance(attr, str) for attr in attrs): # resolve attribute names attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs @@ -989,9 +978,6 @@ cdef class Doc: other.tensor = copy.deepcopy(self.tensor) other.cats = copy.deepcopy(self.cats) other.user_data = copy.deepcopy(self.user_data) - other.is_tagged = self.is_tagged - other.is_parsed = self.is_parsed - other.is_morphed = self.is_morphed other.sentiment = self.sentiment other.has_unknown_spaces = self.has_unknown_spaces other.user_hooks = dict(self.user_hooks) @@ -1065,22 +1051,16 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID] - if self.is_tagged: - array_head.extend([TAG, POS]) - # If doc parsed add head and dep attribute - if self.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = Doc._get_array_attrs() strings = set() for token in self: strings.add(token.tag_) strings.add(token.lemma_) + strings.add(token.morph_) strings.add(token.dep_) strings.add(token.ent_type_) strings.add(token.ent_kb_id_) + strings.add(token.ent_id_) strings.add(token.norm_) # Msgpack doesn't distinguish between lists and tuples, which is # vexing for user data. 
As a best guess, we *know* that within @@ -1230,22 +1210,29 @@ cdef class Doc: DOCS: https://nightly.spacy.io/api/doc#to_json """ data = {"text": self.text} - if self.is_nered: + if self.has_annotation("ENT_IOB"): data["ents"] = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in self.ents] - if self.is_sentenced: + if self.has_annotation("SENT_START"): sents = list(self.sents) data["sents"] = [{"start": sent.start_char, "end": sent.end_char} for sent in sents] if self.cats: data["cats"] = self.cats data["tokens"] = [] + attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"] + include_annotation = {attr: self.has_annotation(attr) for attr in attrs} for token in self: token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)} - if self.is_tagged: - token_data["pos"] = token.pos_ + if include_annotation["TAG"]: token_data["tag"] = token.tag_ - if self.is_parsed: + if include_annotation["POS"]: + token_data["pos"] = token.pos_ + if include_annotation["MORPH"]: + token_data["morph"] = token.morph_ + if include_annotation["LEMMA"]: + token_data["lemma"] = token.lemma_ + if include_annotation["DEP"]: token_data["dep"] = token.dep_ token_data["head"] = token.head.i data["tokens"].append(token_data) @@ -1291,6 +1278,12 @@ cdef class Doc: j += 1 return output + @staticmethod + def _get_array_attrs(): + attrs = [LENGTH, SPACY] + attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS) + return tuple(attrs) + cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int i = token_by_char(tokens, length, start_char) @@ -1321,13 +1314,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2 return mid return -1 - -cdef int set_children_from_heads(TokenC* tokens, int length) except -1: +cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1: + # note: end is exclusive cdef TokenC* head cdef TokenC* child cdef int i # Set number of left/right children to 
0. We'll increment it in the loops. - for i in range(length): + for i in range(start, end): tokens[i].l_kids = 0 tokens[i].r_kids = 0 tokens[i].l_edge = i @@ -1341,38 +1334,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: # without risking getting stuck in an infinite loop if something is # terribly malformed. while not heads_within_sents: - heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count) + heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count) if loop_count > 10: warnings.warn(Warnings.W026) break loop_count += 1 # Set sentence starts - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: - tokens[tokens[i].l_edge].sent_start = True + for i in range(start, end): + tokens[i].sent_start = -1 + for i in range(start, end): + if tokens[i].head == 0: + tokens[tokens[i].l_edge].sent_start = 1 -cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1: +cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1: # May be called multiple times due to non-projectivity. See issues #3170 # and #4688. 
# Set left edges cdef TokenC* head cdef TokenC* child cdef int i, j - for i in range(length): + for i in range(start, end): child = &tokens[i] head = &tokens[i + child.head] - if child < head and loop_count == 0: + if loop_count == 0 and child < head: head.l_kids += 1 if child.l_edge < head.l_edge: head.l_edge = child.l_edge if child.r_edge > head.r_edge: head.r_edge = child.r_edge # Set right edges - same as above, but iterate in reverse - for i in range(length-1, -1, -1): + for i in range(end-1, start-1, -1): child = &tokens[i] head = &tokens[i + child.head] - if child > head and loop_count == 0: + if loop_count == 0 and child > head: head.r_kids += 1 if child.r_edge > head.r_edge: head.r_edge = child.r_edge @@ -1380,14 +1375,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce head.l_edge = child.l_edge # Get sentence start positions according to current state sent_starts = set() - for i in range(length): - if tokens[i].head == 0 and tokens[i].dep != 0: + for i in range(start, end): + if tokens[i].head == 0: sent_starts.add(tokens[i].l_edge) cdef int curr_sent_start = 0 cdef int curr_sent_end = 0 # Check whether any heads are not within the current sentence - for i in range(length): - if (i > 0 and i in sent_starts) or i == length - 1: + for i in range(start, end): + if (i > 0 and i in sent_starts) or i == end - 1: curr_sent_end = i for j in range(curr_sent_start, curr_sent_end): if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1: @@ -1436,6 +1431,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): with shape (n, n), where n = len(doc). 
""" cdef int [:,:] lca_matrix + cdef int j, k n_tokens= end - start lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32) lca_mat.fill(-1) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f06f3307d..781474d3a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -4,13 +4,10 @@ cimport numpy as np from libc.math cimport sqrt import numpy -import numpy.linalg from thinc.api import get_array_module -from collections import defaultdict import warnings from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix -from .token cimport TokenC from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t @@ -204,7 +201,7 @@ cdef class Span: return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) - def as_doc(self, bint copy_user_data=False): + def as_doc(self, *, bint copy_user_data=False): """Create a `Doc` object with a copy of the `Span`'s data. copy_user_data (bool): Whether or not to copy the original doc's user data. 
@@ -212,19 +209,10 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#as_doc """ - # TODO: make copy_user_data a keyword-only argument (Python 3 only) words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID] - if self.doc.is_tagged: - array_head.append(TAG) - # If doc parsed add head and dep attribute - if self.doc.is_parsed: - array_head.extend([HEAD, DEP]) - # Otherwise add sent_start - else: - array_head.append(SENT_START) + array_head = self.doc._get_array_attrs() array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) @@ -378,7 +366,7 @@ cdef class Span: self.doc.sents # Use `sent_start` token attribute to find sentence boundaries cdef int n = 0 - if self.doc.is_sentenced: + if self.doc.has_annotation("SENT_START"): # Find start of the sentence start = self.start while self.doc.c[start].sent_start != 1 and start > 0: @@ -510,8 +498,6 @@ cdef class Span: DOCS: https://nightly.spacy.io/api/span#noun_chunks """ - if not self.doc.is_parsed: - raise ValueError(Errors.E029) # Accumulate the result before beginning to iterate over it. This # prevents the tokenisation from being changed out from under us # during the iteration. The tricky thing here is that Span accepts diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2474f0637..239de4559 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,4 @@ # cython: infer_types=True -from libc.string cimport memcpy -from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. 
from cython.view cimport array as cvarray cimport numpy as np @@ -14,14 +12,13 @@ from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT -from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL -from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX -from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP +from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP +from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL from ..symbols cimport conj from .morphanalysis cimport MorphAnalysis +from .doc cimport set_children_from_heads from .. import parts_of_speech -from .. import util from ..errors import Errors, Warnings from .underscore import Underscore, get_ext_args @@ -489,7 +486,7 @@ cdef class Token: return True def __set__(self, value): - if self.doc.is_parsed: + if self.doc.has_annotation("DEP"): raise ValueError(Errors.E043) if value is None: self.c.sent_start = 0 @@ -658,78 +655,19 @@ cdef class Token: # Do nothing if old head is new head if self.i + self.c.head == new_head.i: return - cdef Token old_head = self.head - cdef int rel_newhead_i = new_head.i - self.i - # Is the new head a descendant of the old head - cdef bint is_desc = old_head.is_ancestor(new_head) - cdef int new_edge - cdef Token anc, child - # Update number of deps of old head - if self.c.head > 0: # left dependent - old_head.c.l_kids -= 1 - if self.c.l_edge == old_head.c.l_edge: - # The token dominates the left edge so the left edge of - # the head may change when the token is reattached, it may - # not change if the new head is a descendant of the current - # head. 
- new_edge = self.c.l_edge - # The new l_edge is the left-most l_edge on any of the - # other dependents where the l_edge is left of the head, - # otherwise it is the head - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.l_edge < new_edge: - new_edge = child.c.l_edge - old_head.c.l_edge = new_edge - # Walk up the tree from old_head and assign new l_edge to - # ancestors until an ancestor already has an l_edge that's - # further left - for anc in old_head.ancestors: - if anc.c.l_edge <= new_edge: - break - anc.c.l_edge = new_edge - elif self.c.head < 0: # right dependent - old_head.c.r_kids -= 1 - # Do the same thing as for l_edge - if self.c.r_edge == old_head.c.r_edge: - new_edge = self.c.r_edge - if not is_desc: - new_edge = old_head.i - for child in old_head.children: - if child == self: - continue - if child.c.r_edge > new_edge: - new_edge = child.c.r_edge - old_head.c.r_edge = new_edge - for anc in old_head.ancestors: - if anc.c.r_edge >= new_edge: - break - anc.c.r_edge = new_edge - # Update number of deps of new head - if rel_newhead_i > 0: # left dependent - new_head.c.l_kids += 1 - # Walk up the tree from new head and set l_edge to self.l_edge - # until you hit a token with an l_edge further to the left - if self.c.l_edge < new_head.c.l_edge: - new_head.c.l_edge = self.c.l_edge - for anc in new_head.ancestors: - if anc.c.l_edge <= self.c.l_edge: - break - anc.c.l_edge = self.c.l_edge - elif rel_newhead_i < 0: # right dependent - new_head.c.r_kids += 1 - # Do the same as for l_edge - if self.c.r_edge > new_head.c.r_edge: - new_head.c.r_edge = self.c.r_edge - for anc in new_head.ancestors: - if anc.c.r_edge >= self.c.r_edge: - break - anc.c.r_edge = self.c.r_edge + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_ancestors = list(self.ancestors) + 
new_head_ancestors = list(new_head.ancestors) + self_root = self_ancestors[-1] if self_ancestors else self + new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge # Set new head - self.c.head = rel_newhead_i + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): diff --git a/spacy/training/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py index 85afdeef3..ebd123375 100644 --- a/spacy/training/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -212,8 +212,6 @@ def doc_from_conllu_sentence( doc[i]._.merged_spaceafter = spaces[i] ents = get_entities(lines, ner_tag_pattern, ner_map) doc.ents = spans_from_biluo_tags(doc, ents) - doc.is_parsed = True - doc.is_tagged = True if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) @@ -243,8 +241,6 @@ def doc_from_conllu_sentence( doc_x[i].dep_ = deps[i] doc_x[i].head = doc_x[heads[i]] doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] - doc_x.is_parsed = True - doc_x.is_tagged = True return doc_x diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx index 5dc39eb31..b58df0d71 100644 --- a/spacy/training/gold_io.pyx +++ b/spacy/training/gold_io.pyx @@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"): link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} json_para["links"].append(link_dict) biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag) + attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB") + include_annotation = {attr: doc.has_annotation(attr) for attr in attrs} for j, sent in enumerate(doc.sents): json_sent = 
{"tokens": [], "brackets": []} for token in sent: json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_} - if doc.is_tagged: + if include_annotation["TAG"]: json_token["tag"] = token.tag_ + if include_annotation["POS"]: json_token["pos"] = token.pos_ + if include_annotation["MORPH"]: json_token["morph"] = token.morph_ + if include_annotation["LEMMA"]: json_token["lemma"] = token.lemma_ - if doc.is_parsed: + if include_annotation["DEP"]: json_token["head"] = token.head.i-token.i json_token["dep"] = token.dep_ - json_token["ner"] = biluo_tags[token.i] + if include_annotation["ENT_IOB"]: + json_token["ner"] = biluo_tags[token.i] json_sent["tokens"].append(json_token) json_para["sentences"].append(json_sent) json_doc["paragraphs"].append(json_para) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 88dc62c2a..380f6a172 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------------------------------------------------------- | | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | +## Doc.has_annotation {#has_annotation tag="method"} + +Check whether the doc contains annotation on a token attribute. + +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------- | +| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ | +| _keyword-only_ | | +| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ | +| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ | + ## Doc.to_array {#to_array tag="method"} Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence @@ -609,26 +620,22 @@ The L2 norm of the document's vector representation. 
## Attributes {#attributes} -| Name | Description | -| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | -| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | -| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | -| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ | -| `is_nered` 2.1 | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. 
~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| Name | Description | +| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | +| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | +| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 44810da58..346b44600 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0. | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. | | [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. 
| @@ -763,6 +764,25 @@ nlp = spacy.blank("en") + ruler.load_from_tag_map(YOUR_TAG_MAP) ``` +### Migrating Doc flags {#migrating-doc-flags} + +The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and +`Doc.is_sentenced` are deprecated in v3 and replaced by the +[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the +token attribute symbols (the same symbols used in `Matcher` patterns): + +```diff +doc = nlp(text) +- doc.is_parsed ++ doc.has_annotation("DEP") +- doc.is_tagged ++ doc.has_annotation("TAG") +- doc.is_sentenced ++ doc.has_annotation("SENT_START") +- doc.is_nered ++ doc.has_annotation("ENT_IOB") +``` + ### Training pipelines and models {#migrating-training} To train your pipelines, you should now pretty much always use the