mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c8fa2247e3
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a18"
|
||||
__version__ = "3.0.0a19"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -121,7 +121,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
|||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
warnings.warn(Warnings.W005)
|
||||
if options.get("collapse_phrases", False):
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
|
|
@ -119,6 +119,11 @@ class Warnings:
|
|||
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
|
||||
"need to match on a stream of documents, you can use nlp.pipe and "
|
||||
"call the {matcher} on each Doc object.")
|
||||
W106 = ("Both HEAD and SENT_START are included as attributes in "
|
||||
"doc.from_array(). The parse trees based on the HEAD attribute "
|
||||
"will override the values in SENT_START.")
|
||||
W107 = ("The property Doc.{prop} is deprecated. Use "
|
||||
"Doc.has_annotation(\"{attr}\") instead.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -192,11 +197,6 @@ class Errors:
|
|||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
E032 = ("Conflicting attributes specified in doc.from_array(): "
|
||||
"(HEAD, SENT_START). The HEAD attribute currently sets sentence "
|
||||
"boundaries implicitly, based on the tree structure. This means "
|
||||
"the HEAD attribute would potentially override the sentence "
|
||||
"boundaries set by SENT_START.")
|
||||
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
||||
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
||||
"length {length}.")
|
||||
|
@ -397,8 +397,8 @@ class Errors:
|
|||
E154 = ("One of the attributes or values is not supported for token "
|
||||
"patterns. Please use the option validate=True with Matcher, "
|
||||
"PhraseMatcher, or EntityRuler for more details.")
|
||||
E155 = ("The pipeline needs to include a tagger in order to use "
|
||||
"Matcher or PhraseMatcher with the attributes POS, TAG, or LEMMA. "
|
||||
E155 = ("The pipeline needs to include a {pipe} in order to use "
|
||||
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
||||
"instead of list(nlp.tokenizer.pipe()).")
|
||||
E156 = ("The pipeline needs to include a parser in order to use "
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
|
@ -17,4 +21,22 @@ class Bengali(Language):
|
|||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
@Bengali.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the "lemmatizer" component registered on Bengali.

    nlp (Language): The pipeline; supplies the language code and the vocab.
    model (Optional[Model]): Model passed through to the Lemmatizer.
    name (str): The component instance name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to Lemmatizer.load_lookups
        together with the language code and the mode.
    RETURNS (Lemmatizer): The constructed lemmatizer component.
    """
    # Resolve the tables first, then build the component around them.
    tables = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -16,7 +16,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["sb", "oa", "da", "nk", "mo", "ag", "ROOT", "root", "cj", "pd", "og", "app"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
np_deps = set(doc.vocab.strings.add(label) for label in labels)
|
||||
|
|
|
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
# Further improvement of the models will eliminate the need for this tag.
|
||||
labels = ["nsubj", "obj", "iobj", "appos", "ROOT", "obl"]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "dobj", "nsubjpass", "pcomp", "pobj", "dative", "appos", "attr", "ROOT"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -8,7 +8,7 @@ from ...tokens import Doc, Span, Token
|
|||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
||||
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
|
||||
doc = doclike.doc
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
if not len(doc):
|
||||
return
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class PersianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,22 @@ class Persian(Language):
|
|||
Defaults = PersianDefaults
|
||||
|
||||
|
||||
@Persian.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the "lemmatizer" component registered on Persian.

    nlp (Language): The pipeline; supplies the language code and the vocab.
    model (Optional[Model]): Model passed through to the Lemmatizer.
    name (str): The component instance name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to Lemmatizer.load_lookups
        together with the language code and the mode.
    RETURNS (Lemmatizer): The constructed lemmatizer component.
    """
    # Resolve the tables first, then build the component around them.
    tables = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -19,7 +19,7 @@ def noun_chunks(doclike):
|
|||
]
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
np_deps = [doc.vocab.strings.add(label) for label in labels]
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -13,7 +13,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -1,9 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
|
@ -20,4 +24,22 @@ class Norwegian(Language):
|
|||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
@Norwegian.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the "lemmatizer" component registered on Norwegian.

    nlp (Language): The pipeline; supplies the language code and the vocab.
    model (Optional[Model]): Model passed through to the Lemmatizer.
    name (str): The component instance name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to Lemmatizer.load_lookups
        together with the language code and the mode.
    RETURNS (Lemmatizer): The constructed lemmatizer component.
    """
    # Resolve the tables first, then build the component around them.
    tables = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
from typing import Optional
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -22,4 +27,22 @@ class Swedish(Language):
|
|||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
@Swedish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    lookups: Optional[Lookups],
):
    """Factory for the "lemmatizer" component registered on Swedish.

    nlp (Language): The pipeline; supplies the language code and the vocab.
    model (Optional[Model]): Model passed through to the Lemmatizer.
    name (str): The component instance name.
    mode (str): The lemmatizer mode ("rule" in the default config).
    lookups (Optional[Lookups]): Lookups handed to Lemmatizer.load_lookups
        together with the language code and the mode.
    RETURNS (Lemmatizer): The constructed lemmatizer component.
    """
    # Resolve the tables first, then build the component around them.
    tables = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=tables)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -11,7 +11,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
|
|||
labels = ["nsubj", "nsubj:pass", "dobj", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
|
|
|
@ -8,7 +8,7 @@ from contextlib import contextmanager
|
|||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from thinc.api import get_current_ops, Config, require_gpu, Optimizer
|
||||
from thinc.api import Model, get_current_ops, Config, require_gpu, Optimizer
|
||||
import srsly
|
||||
import multiprocessing as mp
|
||||
from itertools import chain, cycle
|
||||
|
@ -1448,10 +1448,15 @@ class Language:
|
|||
"""Register 'listeners' within pipeline components, to allow them to
|
||||
effectively share weights.
|
||||
"""
|
||||
# I had thought, "Why do we do this inside the Language object? Shouldn't
|
||||
# it be the tok2vec/transformer/etc's job?"
|
||||
# The problem is we need to do it during deserialization...And the
|
||||
# components don't receive the pipeline then. So this does have to be
|
||||
# here :(
|
||||
for i, (name1, proc1) in enumerate(self.pipeline):
|
||||
if hasattr(proc1, "find_listeners"):
|
||||
for name2, proc2 in self.pipeline[i:]:
|
||||
if hasattr(proc2, "model"):
|
||||
for name2, proc2 in self.pipeline[i+1:]:
|
||||
if isinstance(getattr(proc2, "model", None), Model):
|
||||
proc1.find_listeners(proc2.model)
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -17,7 +17,7 @@ from ..vocab cimport Vocab
|
|||
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.token cimport Token
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA
|
||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||
|
||||
from ..schemas import validate_token_pattern
|
||||
from ..errors import Errors, MatchPatternError, Warnings
|
||||
|
@ -215,10 +215,15 @@ cdef class Matcher:
|
|||
else:
|
||||
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||
cdef Pool tmp_pool = Pool()
|
||||
if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \
|
||||
and not doc.is_tagged:
|
||||
raise ValueError(Errors.E155.format())
|
||||
if DEP in self._seen_attrs and not doc.is_parsed:
|
||||
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
|
||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
||||
if POS in self._seen_attrs and not doc.has_annotation("POS"):
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
||||
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
||||
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
|
||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
||||
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E156.format())
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||
extensions=self._extensions, predicates=self._extra_predicates)
|
||||
|
|
|
@ -4,7 +4,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
|
|||
|
||||
import warnings
|
||||
|
||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
|
||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.token cimport Token
|
||||
from ..tokens.span cimport Span
|
||||
|
@ -184,12 +184,20 @@ cdef class PhraseMatcher:
|
|||
if len(doc) == 0:
|
||||
continue
|
||||
if isinstance(doc, Doc):
|
||||
if self.attr in (POS, TAG, LEMMA) and not doc.is_tagged:
|
||||
raise ValueError(Errors.E155.format())
|
||||
if self.attr == DEP and not doc.is_parsed:
|
||||
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
||||
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
if self.attr == TAG and not has_annotation[TAG]:
|
||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
||||
if self.attr == POS and not has_annotation[POS]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
||||
if self.attr == MORPH and not has_annotation[MORPH]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
||||
if self.attr == LEMMA and not has_annotation[LEMMA]:
|
||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
||||
if self.attr == DEP and not has_annotation[DEP]:
|
||||
raise ValueError(Errors.E156.format())
|
||||
if self._validate and (doc.is_tagged or doc.is_parsed) \
|
||||
and self.attr not in (DEP, POS, TAG, LEMMA):
|
||||
if self._validate and any(has_annotation.values()) \
|
||||
and self.attr not in attrs:
|
||||
string_attr = self.vocab.strings[self.attr]
|
||||
warnings.warn(Warnings.W012.format(key=key, attr=string_attr))
|
||||
keyword = self._convert_to_array(doc)
|
||||
|
|
|
@ -164,7 +164,7 @@ def MultiHashEmbed(
|
|||
|
||||
|
||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
|
||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
|
||||
"""Construct an embedded representation based on character embeddings, using
|
||||
a feed-forward network. A fixed number of UTF-8 byte characters are used for
|
||||
each word, taken from the beginning and end of the word equally. Padding is
|
||||
|
@ -188,18 +188,35 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
|||
nC (int): The number of UTF-8 bytes to embed per word. Recommended values
|
||||
are between 3 and 8, although it may depend on the length of words in the
|
||||
language.
|
||||
also_use_static_vectors (bool): Whether to also use static word vectors.
|
||||
Requires a vectors table to be loaded in the Doc objects' vocab.
|
||||
"""
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
if also_use_static_vectors:
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
StaticVectors(width, dropout=0.0),
|
||||
),
|
||||
),
|
||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||
ragged2list(),
|
||||
with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
|
||||
ragged2list(),
|
||||
)
|
||||
else:
|
||||
model = chain(
|
||||
concatenate(
|
||||
chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
|
||||
chain(
|
||||
FeatureExtractor([NORM]),
|
||||
list2ragged(),
|
||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||
),
|
||||
),
|
||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||
ragged2list(),
|
||||
)
|
||||
return model
|
||||
|
||||
|
|
|
@ -679,8 +679,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
st._sent[i].dep = self.root_label
|
||||
|
||||
def finalize_doc(self, Doc doc):
|
||||
doc.is_parsed = True
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
|
||||
def has_gold(self, Example eg, start=0, end=None):
|
||||
for word in eg.y[start:end]:
|
||||
|
|
|
@ -119,7 +119,7 @@ cpdef deprojectivize(Doc doc):
|
|||
new_head = _find_new_head(doc[i], head_label)
|
||||
doc.c[i].head = new_head.i - i
|
||||
doc.c[i].dep = doc.vocab.strings.add(new_label)
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
return doc
|
||||
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ def merge_noun_chunks(doc: Doc) -> Doc:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/pipeline-functions#merge_noun_chunks
|
||||
"""
|
||||
if not doc.is_parsed:
|
||||
if not doc.has_annotation("DEP"):
|
||||
return doc
|
||||
with doc.retokenize() as retokenizer:
|
||||
for np in doc.noun_chunks:
|
||||
|
|
|
@ -32,6 +32,7 @@ width = 128
|
|||
rows = 7000
|
||||
nM = 64
|
||||
nC = 8
|
||||
also_use_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
|
@ -203,8 +204,6 @@ class Morphologizer(Tagger):
|
|||
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
|
||||
doc.c[j].pos = self.cfg["labels_pos"][morph]
|
||||
|
||||
doc.is_morphed = True
|
||||
|
||||
def get_loss(self, examples, scores):
|
||||
"""Find the loss and gradient of loss for the batch of documents and
|
||||
their predicted scores.
|
||||
|
@ -259,79 +258,3 @@ class Morphologizer(Tagger):
|
|||
results.update(Scorer.score_token_attr_per_feat(examples,
|
||||
"morph", **kwargs))
|
||||
return results
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
    """Serialize the pipe to a bytestring.

    Three fields are offered to util.to_bytes: "model", "vocab" and "cfg"
    (the latter dumped as JSON).

    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (bytes): The serialized object.

    DOCS: https://nightly.spacy.io/api/morphologizer#to_bytes
    """
    # Every value is a zero-argument callable producing that field's data.
    getters = {
        "model": self.model.to_bytes,
        "vocab": self.vocab.to_bytes,
        "cfg": lambda: srsly.json_dumps(self.cfg),
    }
    return util.to_bytes(getters, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
    """Load the pipe from a bytestring and return it.

    bytes_data (bytes): The serialized pipe.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (Morphologizer): The loaded Morphologizer (this same object).

    DOCS: https://nightly.spacy.io/api/morphologizer#from_bytes
    """

    def restore_model(b):
        # An AttributeError while loading the model is surfaced as a
        # ValueError (E149), with the original traceback context suppressed.
        try:
            self.model.from_bytes(b)
        except AttributeError:
            raise ValueError(Errors.E149) from None

    # One single-argument setter per serialized field.
    setters = {}
    setters["vocab"] = lambda b: self.vocab.from_bytes(b)
    setters["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
    setters["model"] = restore_model
    util.from_bytes(bytes_data, setters, exclude)
    return self
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
    """Serialize the pipe to disk.

    Three writers are offered to util.to_disk: "vocab", "model" (the model
    bytes written to a binary file) and "cfg" (written as JSON).

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.

    DOCS: https://nightly.spacy.io/api/morphologizer#to_disk
    """

    def save_model(p):
        # Use a context manager so the file handle is closed deterministically
        # rather than whenever the object returned by p.open() is collected.
        # The byte count is still returned, as the previous lambda did.
        with p.open("wb") as file_:
            return file_.write(self.model.to_bytes())

    serialize = {
        "vocab": lambda p: self.vocab.to_disk(p),
        "model": save_model,
        "cfg": lambda p: srsly.write_json(p, self.cfg),
    }
    util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
    """Load the pipe from disk, modifying the object in place.

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (Morphologizer): The modified Morphologizer object.

    DOCS: https://nightly.spacy.io/api/morphologizer#from_disk
    """

    def read_model(p):
        with p.open("rb") as handle:
            # An AttributeError while loading the model is surfaced as a
            # ValueError (E149), with the original context suppressed.
            try:
                self.model.from_bytes(handle.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

    # One single-argument reader per serialized field.
    readers = {}
    readers["vocab"] = lambda p: self.vocab.from_disk(p)
    readers["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
    readers["model"] = read_model
    util.from_disk(path, readers, exclude)
    return self
|
||||
|
|
|
@ -170,79 +170,3 @@ class SentenceRecognizer(Tagger):
|
|||
results = Scorer.score_spans(examples, "sents", **kwargs)
|
||||
del results["sents_per_type"]
|
||||
return results
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
    """Serialize the pipe to a bytestring.

    Three fields are offered to util.to_bytes: "model", "vocab" and "cfg"
    (the latter dumped as JSON).

    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (bytes): The serialized object.

    DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_bytes
    """
    # Every value is a zero-argument callable producing that field's data.
    getters = {
        "model": self.model.to_bytes,
        "vocab": self.vocab.to_bytes,
        "cfg": lambda: srsly.json_dumps(self.cfg),
    }
    return util.to_bytes(getters, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
    """Load the pipe from a bytestring and return it.

    bytes_data (bytes): The serialized pipe.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (SentenceRecognizer): The loaded SentenceRecognizer (this same
        object).

    DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_bytes
    """

    def restore_model(b):
        # An AttributeError while loading the model is surfaced as a
        # ValueError (E149), with the original traceback context suppressed.
        try:
            self.model.from_bytes(b)
        except AttributeError:
            raise ValueError(Errors.E149) from None

    # One single-argument setter per serialized field.
    setters = {}
    setters["vocab"] = lambda b: self.vocab.from_bytes(b)
    setters["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
    setters["model"] = restore_model
    util.from_bytes(bytes_data, setters, exclude)
    return self
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
    """Serialize the pipe to disk.

    Three writers are offered to util.to_disk: "vocab", "model" (the model
    bytes written to a binary file) and "cfg" (written as JSON).

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.

    DOCS: https://nightly.spacy.io/api/sentencerecognizer#to_disk
    """

    def save_model(p):
        # Use a context manager so the file handle is closed deterministically
        # rather than whenever the object returned by p.open() is collected.
        # The byte count is still returned, as the previous lambda did.
        with p.open("wb") as file_:
            return file_.write(self.model.to_bytes())

    serialize = {
        "vocab": lambda p: self.vocab.to_disk(p),
        "model": save_model,
        "cfg": lambda p: srsly.write_json(p, self.cfg),
    }
    util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
    """Load the pipe from disk, modifying the object in place.

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (SentenceRecognizer): The modified SentenceRecognizer object.

    DOCS: https://nightly.spacy.io/api/sentencerecognizer#from_disk
    """

    def read_model(p):
        with p.open("rb") as handle:
            # An AttributeError while loading the model is surfaced as a
            # ValueError (E149), with the original context suppressed.
            try:
                self.model.from_bytes(handle.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

    # One single-argument reader per serialized field.
    readers = {}
    readers["vocab"] = lambda p: self.vocab.from_disk(p)
    readers["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
    readers["model"] = read_model
    util.from_disk(path, readers, exclude)
    return self
|
||||
|
|
|
@ -168,7 +168,6 @@ class Tagger(Pipe):
|
|||
# Don't clobber preset POS tags
|
||||
if doc.c[j].tag == 0:
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
doc.is_tagged = True
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False):
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
|
|
|
@ -106,6 +106,7 @@ def test_doc_api_serialize(en_tokenizer, text):
|
|||
tokens = en_tokenizer(text)
|
||||
tokens[0].lemma_ = "lemma"
|
||||
tokens[0].norm_ = "norm"
|
||||
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 0, 1)]
|
||||
tokens[0].ent_kb_id_ = "ent_kb_id"
|
||||
new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
|
||||
assert tokens.text == new_tokens.text
|
||||
|
@ -144,7 +145,6 @@ def test_doc_api_set_ents(en_tokenizer):
|
|||
|
||||
def test_doc_api_sents_empty_string(en_tokenizer):
|
||||
doc = en_tokenizer("")
|
||||
doc.is_parsed = True
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 0
|
||||
|
||||
|
@ -181,10 +181,11 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
|
||||
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
|
||||
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
|
||||
deps = ["dep"] * len(heads)
|
||||
# fmt: on
|
||||
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
assert doc[6].text == "for"
|
||||
subtree = [w.text for w in doc[6].subtree]
|
||||
# fmt: off
|
||||
|
@ -240,7 +241,9 @@ def test_doc_api_similarity_match():
|
|||
)
|
||||
def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
||||
tokens = en_tokenizer(sentence)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(
|
||||
tokens.vocab, [t.text for t in tokens], heads=heads, deps=["dep"] * len(heads)
|
||||
)
|
||||
lca = doc.get_lca_matrix()
|
||||
assert (lca == lca_matrix).all()
|
||||
assert lca[1, 1] == 1
|
||||
|
@ -251,51 +254,55 @@ def test_lowest_common_ancestor(en_tokenizer, sentence, heads, lca_matrix):
|
|||
def test_doc_is_nered(en_vocab):
|
||||
words = ["I", "live", "in", "New", "York"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
assert not doc.is_nered
|
||||
assert not doc.has_annotation("ENT_IOB")
|
||||
doc.ents = [Span(doc, 3, 5, label="GPE")]
|
||||
assert doc.is_nered
|
||||
assert doc.has_annotation("ENT_IOB")
|
||||
# Test creating doc from array with unknown values
|
||||
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
|
||||
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
|
||||
assert doc.is_nered
|
||||
assert doc.has_annotation("ENT_IOB")
|
||||
# Test serialization
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc.is_nered
|
||||
assert new_doc.has_annotation("ENT_IOB")
|
||||
|
||||
|
||||
def test_doc_from_array_sent_starts(en_vocab):
|
||||
words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
|
||||
heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
|
||||
heads = [0, -1, -2, -3, -4, -5, 0, -1, -2, -3]
|
||||
# fmt: off
|
||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep", "dep"]
|
||||
deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
|
||||
# fmt: on
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, (dep, head) in enumerate(zip(deps, heads)):
|
||||
doc[i].dep_ = dep
|
||||
doc[i].head = doc[head]
|
||||
if head == i:
|
||||
doc[i].is_sent_start = True
|
||||
doc.is_parsed
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
|
||||
# HEAD overrides SENT_START with warning
|
||||
attrs = [SENT_START, HEAD]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
with pytest.raises(ValueError):
|
||||
with pytest.warns(UserWarning):
|
||||
new_doc.from_array(attrs, arr)
|
||||
|
||||
attrs = [SENT_START, DEP]
|
||||
# no warning using default attrs
|
||||
attrs = doc._get_array_attrs()
|
||||
arr = doc.to_array(attrs)
|
||||
with pytest.warns(None) as record:
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert len(record) == 0
|
||||
|
||||
# only SENT_START uses SENT_START
|
||||
attrs = [SENT_START]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
|
||||
assert not new_doc.is_parsed
|
||||
assert not new_doc.has_annotation("DEP")
|
||||
|
||||
# only HEAD uses HEAD
|
||||
attrs = [HEAD, DEP]
|
||||
arr = doc.to_array(attrs)
|
||||
new_doc = Doc(en_vocab, words=words)
|
||||
new_doc.from_array(attrs, arr)
|
||||
assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
|
||||
assert new_doc.is_parsed
|
||||
assert new_doc.has_annotation("DEP")
|
||||
|
||||
|
||||
def test_doc_from_array_morph(en_vocab):
|
||||
|
@ -365,9 +372,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert m_doc[9].idx == think_idx
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
|
||||
with pytest.raises(ValueError):
|
||||
# important attributes from sentencizer or parser are missing
|
||||
assert list(m_doc.sents)
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
# space delimiter considered, although spacy attribute was missing
|
||||
assert str(m_doc) == " ".join(en_texts_without_empty)
|
||||
|
@ -379,6 +383,15 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
|||
assert m_doc[9].idx == think_idx
|
||||
|
||||
|
||||
def test_doc_api_from_docs_ents(en_tokenizer):
    """Merge a doc without entities and a doc with one entity.

    The merged doc is expected to expose exactly the single entity that was
    set on the second input doc.
    """
    first = en_tokenizer("Merging the docs is fun.")
    second = en_tokenizer("They don't think alike.")
    # Explicitly assign an empty entity tuple to the first doc.
    first.ents = ()
    second.ents = (Span(second, 0, 1, label="foo"),)
    merged = Doc.from_docs([first, second])
    assert len(merged.ents) == 1
|
||||
|
||||
|
||||
def test_doc_lang(en_vocab):
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
assert doc.lang_ == "en"
|
||||
|
@ -399,3 +412,45 @@ def test_token_lexeme(en_vocab):
|
|||
assert isinstance(token.lex, Lexeme)
|
||||
assert token.lex.text == token.text
|
||||
assert en_vocab[token.orth] == token.lex
|
||||
|
||||
|
||||
def test_has_annotation(en_vocab):
    """Exercise Doc.has_annotation in three stages.

    1. A fresh Doc reports no annotation for any checked attribute.
    2. After annotating only the first token, every attribute is reported as
       present but not as complete.
    3. After annotating the second token too, every attribute is reported as
       present and complete.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")

    # Stage 1: nothing has been set yet.
    for name in attrs:
        assert not doc.has_annotation(name)

    # Stage 2: annotate the first token only.
    first_values = {
        "tag_": "A",
        "pos_": "X",
        "morph_": "Feat=Val",
        "lemma_": "a",
        "dep_": "dep",
    }
    for field, value in first_values.items():
        setattr(doc[0], field, value)
    doc[0].head = doc[1]
    doc.ents = [Span(doc, 0, 1, label="HELLO")]

    for name in attrs:
        assert doc.has_annotation(name)
        assert not doc.has_annotation(name, require_complete=True)

    # Stage 3: annotate the second token as well. The morph value is the
    # empty string here, and the completeness check below still has to pass.
    second_values = {
        "tag_": "A",
        "pos_": "X",
        "morph_": "",
        "lemma_": "a",
        "dep_": "dep",
    }
    for field, value in second_values.items():
        setattr(doc[1], field, value)
    doc.ents = [Span(doc, 0, 2, label="HELLO")]

    for name in attrs:
        assert doc.has_annotation(name)
        assert doc.has_annotation(name, require_complete=True)
|
||||
|
||||
|
||||
def test_is_flags_deprecated(en_tokenizer):
    """Reading any of the legacy Doc.is_* flags must emit a deprecation warning."""
    doc = en_tokenizer("test")
    # Each flag is checked in its own context so that every single access
    # has to trigger a warning on its own.
    for flag in ("is_tagged", "is_parsed", "is_nered", "is_sentenced"):
        with pytest.deprecated_call():
            getattr(doc, flag)
|
||||
|
|
|
@ -24,7 +24,6 @@ def doc_not_parsed(en_tokenizer):
|
|||
text = "This is a sentence. This is another sentence. And a third."
|
||||
tokens = en_tokenizer(text)
|
||||
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
|
||||
doc.is_parsed = False
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -71,8 +70,9 @@ def test_spans_string_fn(doc):
|
|||
def test_spans_root2(en_tokenizer):
|
||||
text = "through North and South Carolina"
|
||||
heads = [0, 3, -1, -2, -4]
|
||||
deps = ["dep"] * len(heads)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
assert doc[-2:].root.text == "Carolina"
|
||||
|
||||
|
||||
|
@ -92,7 +92,7 @@ def test_spans_span_sent(doc, doc_not_parsed):
|
|||
def test_spans_lca_matrix(en_tokenizer):
|
||||
"""Test span's lca matrix generation"""
|
||||
tokens = en_tokenizer("the lazy dog slept")
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
|
||||
lca = doc[:2].get_lca_matrix()
|
||||
assert lca.shape == (2, 2)
|
||||
assert lca[0, 0] == 0 # the & the -> the
|
||||
|
|
|
@ -112,11 +112,11 @@ def test_doc_token_api_ancestors(en_tokenizer):
|
|||
|
||||
|
||||
def test_doc_token_api_head_setter(en_tokenizer):
|
||||
# the structure of this sentence depends on the English annotation scheme
|
||||
text = "Yesterday I saw a dog that barked loudly."
|
||||
heads = [2, 1, 0, 1, -2, 1, -2, -1, -6]
|
||||
deps = ["dep"] * len(heads)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
assert doc[6].n_lefts == 1
|
||||
assert doc[6].n_rights == 1
|
||||
|
@ -169,13 +169,46 @@ def test_doc_token_api_head_setter(en_tokenizer):
|
|||
with pytest.raises(ValueError):
|
||||
doc[0].head = doc2[0]
|
||||
|
||||
# test sentence starts when two sentences are joined
|
||||
text = "This is one sentence. This is another sentence."
|
||||
heads = [0, -1, -2, -3, -4, 0, -1, -2, -3, -4]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
heads=heads,
|
||||
deps=["dep"] * len(heads),
|
||||
)
|
||||
# initially two sentences
|
||||
assert doc[0].is_sent_start
|
||||
assert doc[5].is_sent_start
|
||||
assert doc[0].left_edge == doc[0]
|
||||
assert doc[0].right_edge == doc[4]
|
||||
assert doc[5].left_edge == doc[5]
|
||||
assert doc[5].right_edge == doc[9]
|
||||
|
||||
# modifying within a sentence doesn't change sent starts
|
||||
doc[2].head = doc[3]
|
||||
assert doc[0].is_sent_start
|
||||
assert doc[5].is_sent_start
|
||||
assert doc[0].left_edge == doc[0]
|
||||
assert doc[0].right_edge == doc[4]
|
||||
assert doc[5].left_edge == doc[5]
|
||||
assert doc[5].right_edge == doc[9]
|
||||
|
||||
# attach the second sentence to the first, resulting in one sentence
|
||||
doc[5].head = doc[0]
|
||||
assert doc[0].is_sent_start
|
||||
assert not doc[5].is_sent_start
|
||||
assert doc[0].left_edge == doc[0]
|
||||
assert doc[0].right_edge == doc[9]
|
||||
|
||||
|
||||
def test_is_sent_start(en_tokenizer):
|
||||
doc = en_tokenizer("This is a sentence. This is another.")
|
||||
assert doc[5].is_sent_start is None
|
||||
doc[5].is_sent_start = True
|
||||
assert doc[5].is_sent_start is True
|
||||
doc.is_parsed = True
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
|
@ -184,7 +217,6 @@ def test_is_sent_end(en_tokenizer):
|
|||
assert doc[4].is_sent_end is None
|
||||
doc[5].is_sent_start = True
|
||||
assert doc[4].is_sent_end is True
|
||||
doc.is_parsed = True
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
|
@ -209,14 +241,14 @@ def test_token0_has_sent_start_true():
|
|||
doc = Doc(Vocab(), words=["hello", "world"])
|
||||
assert doc[0].is_sent_start is True
|
||||
assert doc[1].is_sent_start is None
|
||||
assert not doc.is_sentenced
|
||||
assert not doc.has_annotation("SENT_START")
|
||||
|
||||
|
||||
def test_tokenlast_has_sent_end_true():
|
||||
doc = Doc(Vocab(), words=["hello", "world"])
|
||||
assert doc[0].is_sent_end is None
|
||||
assert doc[1].is_sent_end is True
|
||||
assert not doc.is_sentenced
|
||||
assert not doc.has_annotation("SENT_START")
|
||||
|
||||
|
||||
def test_token_api_conjuncts_chain(en_vocab):
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_de(de_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'de' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = de_tokenizer("Er lag auf seinem")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_el(el_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'el' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = el_tokenizer("είναι χώρα της νοτιοανατολικής")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -11,12 +11,8 @@ from ...util import get_doc
|
|||
|
||||
def test_noun_chunks_is_parsed(en_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = en_tokenizer("This is a sentence")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
||||
|
|
|
@ -7,8 +7,9 @@ from ...util import get_doc, apply_transition_sequence
|
|||
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
|
||||
def test_en_sbd_single_punct(en_tokenizer, text, punct):
    """Check that one sentence, with or without final punctuation, is one span.

    The doc is built from the tokenized text plus a hand-written parse
    (``heads`` and placeholder ``deps``). Sentence iteration must then yield
    exactly one sentence that covers every token.
    """
    # One head per token: four tokens when punctuation is appended, else three.
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    deps = ["dep"] * len(heads)
    tokens = en_tokenizer(text + punct)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    # The parentheses are required: a conditional expression binds looser than
    # `==`, so the unparenthesised form degenerates to `assert 3` (always
    # true) whenever `punct` is empty.
    assert len(doc) == (4 if punct else 3)
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_es(es_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'es' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = es_tokenizer("en Oxford este verano")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -3,12 +3,8 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_fa(fa_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fa' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
|
||||
doc = fa_tokenizer("این یک جمله نمونه می باشد.")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_fr(fr_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'fr' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = fr_tokenizer("trouver des travaux antérieurs")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_id(id_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'id' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = id_tokenizer("sebelas")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -3,11 +3,7 @@ import pytest
|
|||
|
||||
def test_noun_chunks_is_parsed_nb(nb_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'nb' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = nb_tokenizer("Smørsausen brukes bl.a. til")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
|
|
@ -5,12 +5,8 @@ from ...util import get_doc
|
|||
|
||||
def test_noun_chunks_is_parsed_sv(sv_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'sv' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = sv_tokenizer("Studenten läste den bästa boken")
|
||||
doc.is_parsed = False
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ from spacy.util import get_lang_class
|
|||
# Only include languages with no external dependencies
|
||||
# excluded: ru, uk
|
||||
# excluded for custom tables: pl
|
||||
LANGUAGES = ["el", "en", "fr", "nl"]
|
||||
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
|
|
@ -301,11 +301,14 @@ def test_matcher_basic_check(en_vocab):
|
|||
|
||||
def test_attr_pipeline_checks(en_vocab):
|
||||
doc1 = Doc(en_vocab, words=["Test"])
|
||||
doc1.is_parsed = True
|
||||
doc1[0].dep_ = "ROOT"
|
||||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2.is_tagged = True
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc2[0].lemma_ = "LEMMA"
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
# DEP requires is_parsed
|
||||
# DEP requires DEP
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"DEP": "a"}]])
|
||||
matcher(doc1)
|
||||
|
@ -313,7 +316,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
matcher(doc2)
|
||||
with pytest.raises(ValueError):
|
||||
matcher(doc3)
|
||||
# TAG, POS, LEMMA require is_tagged
|
||||
# TAG, POS, LEMMA require those values
|
||||
for attr in ("TAG", "POS", "LEMMA"):
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{attr: "a"}]])
|
||||
|
|
|
@ -187,9 +187,11 @@ def test_phrase_matcher_bool_attrs(en_vocab):
|
|||
|
||||
def test_phrase_matcher_validation(en_vocab):
|
||||
doc1 = Doc(en_vocab, words=["Test"])
|
||||
doc1.is_parsed = True
|
||||
doc1[0].dep_ = "ROOT"
|
||||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2.is_tagged = True
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
matcher = PhraseMatcher(en_vocab, validate=True)
|
||||
with pytest.warns(UserWarning):
|
||||
|
@ -212,18 +214,21 @@ def test_attr_validation(en_vocab):
|
|||
|
||||
def test_attr_pipeline_checks(en_vocab):
|
||||
doc1 = Doc(en_vocab, words=["Test"])
|
||||
doc1.is_parsed = True
|
||||
doc1[0].dep_ = "ROOT"
|
||||
doc2 = Doc(en_vocab, words=["Test"])
|
||||
doc2.is_tagged = True
|
||||
doc2[0].tag_ = "TAG"
|
||||
doc2[0].pos_ = "X"
|
||||
doc2[0].morph_ = "Feat=Val"
|
||||
doc2[0].lemma_ = "LEMMA"
|
||||
doc3 = Doc(en_vocab, words=["Test"])
|
||||
# DEP requires is_parsed
|
||||
# DEP requires DEP
|
||||
matcher = PhraseMatcher(en_vocab, attr="DEP")
|
||||
matcher.add("TEST1", [doc1])
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add("TEST2", [doc2])
|
||||
with pytest.raises(ValueError):
|
||||
matcher.add("TEST3", [doc3])
|
||||
# TAG, POS, LEMMA require is_tagged
|
||||
# TAG, POS, LEMMA require those values
|
||||
for attr in ("TAG", "POS", "LEMMA"):
|
||||
matcher = PhraseMatcher(en_vocab, attr=attr)
|
||||
matcher.add("TEST2", [doc2])
|
||||
|
|
|
@ -67,8 +67,9 @@ def test_parser_initial(en_tokenizer, en_parser):
|
|||
def test_parser_parse_subtrees(en_tokenizer, en_parser):
|
||||
text = "The four wheels on the bus turned quickly"
|
||||
heads = [2, 1, 4, -1, 1, -2, 0, -1]
|
||||
deps = ["dep"] * len(heads)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
assert len(list(doc[2].lefts)) == 2
|
||||
assert len(list(doc[2].rights)) == 1
|
||||
assert len(list(doc[2].children)) == 3
|
||||
|
@ -184,7 +185,7 @@ def test_parser_set_sent_starts(en_vocab):
|
|||
if i == 0 or i == 3:
|
||||
assert doc[i].is_sent_start is True
|
||||
else:
|
||||
assert doc[i].is_sent_start is None
|
||||
assert doc[i].is_sent_start is False
|
||||
for sent in doc.sents:
|
||||
for token in sent:
|
||||
assert token.head in sent
|
||||
|
|
|
@ -63,7 +63,7 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
|||
|
||||
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
|
||||
|
||||
lefts = {}
|
||||
rights = {}
|
||||
|
|
|
@ -8,8 +8,9 @@ from ..util import get_doc, apply_transition_sequence
|
|||
def test_parser_space_attachment(en_tokenizer):
|
||||
text = "This is a test.\nTo ensure spaces are attached well."
|
||||
heads = [1, 0, 1, -2, -3, -1, 1, 4, -1, 2, 1, 0, -1, -2]
|
||||
deps = ["dep"] * len(heads)
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
for sent in doc.sents:
|
||||
if len(sent) == 1:
|
||||
assert not sent[-1].is_space
|
||||
|
|
|
@ -72,6 +72,8 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
|||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
|
||||
|
||||
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||
|
@ -82,6 +84,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
# initialize with patterns from asset
|
||||
nlp.add_pipe(
|
||||
|
@ -93,6 +97,8 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
|||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
assert doc.has_annotation("LEMMA")
|
||||
assert doc.has_annotation("MORPH")
|
||||
|
||||
|
||||
def test_attributeruler_score(nlp, pattern_dicts):
|
||||
|
|
|
@ -35,8 +35,6 @@ def doc2(en_tokenizer):
|
|||
deps=deps,
|
||||
)
|
||||
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
return doc
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ def test_sentencizer(en_vocab):
|
|||
doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
|
||||
sentencizer = Sentencizer(punct_chars=None)
|
||||
doc = sentencizer(doc)
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
sent_starts = [t.is_sent_start for t in doc]
|
||||
sent_ends = [t.is_sent_end for t in doc]
|
||||
assert sent_starts == [True, False, True, False, False, False, False]
|
||||
|
@ -22,13 +22,13 @@ def test_sentencizer_pipe():
|
|||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
for doc in nlp.pipe(texts):
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
sent_starts = [t.is_sent_start for t in doc]
|
||||
assert sent_starts == [True, False, True, False, False, False, False]
|
||||
assert len(list(doc.sents)) == 2
|
||||
for ex in nlp.pipe(texts):
|
||||
doc = ex.doc
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
sent_starts = [t.is_sent_start for t in doc]
|
||||
assert sent_starts == [True, False, True, False, False, False, False]
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
@ -42,7 +42,7 @@ def test_sentencizer_empty_docs():
|
|||
nlp.add_pipe("sentencizer")
|
||||
for texts in [one_empty_text, many_empty_texts, some_empty_texts]:
|
||||
for doc in nlp.pipe(texts):
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
sent_starts = [t.is_sent_start for t in doc]
|
||||
if len(doc) == 0:
|
||||
assert sent_starts == []
|
||||
|
@ -82,7 +82,7 @@ def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
|
|||
doc = Doc(en_vocab, words=words)
|
||||
sentencizer = Sentencizer(punct_chars=None)
|
||||
doc = sentencizer(doc)
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
assert [t.is_sent_start for t in doc] == sent_starts
|
||||
assert [t.is_sent_end for t in doc] == sent_ends
|
||||
assert len(list(doc.sents)) == n_sents
|
||||
|
@ -115,7 +115,7 @@ def test_sentencizer_custom_punct(
|
|||
doc = Doc(en_vocab, words=words)
|
||||
sentencizer = Sentencizer(punct_chars=punct_chars)
|
||||
doc = sentencizer(doc)
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
assert [t.is_sent_start for t in doc] == sent_starts
|
||||
assert [t.is_sent_end for t in doc] == sent_ends
|
||||
assert len(list(doc.sents)) == n_sents
|
||||
|
|
|
@ -94,7 +94,6 @@ def test_issue309(en_tokenizer):
|
|||
doc = get_doc(
|
||||
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
|
||||
)
|
||||
doc.is_parsed = True
|
||||
assert len(doc) == 1
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 1
|
||||
|
@ -170,11 +169,9 @@ def test_issue595():
|
|||
|
||||
def test_issue599(en_vocab):
|
||||
doc = Doc(en_vocab)
|
||||
doc.is_tagged = True
|
||||
doc.is_parsed = True
|
||||
doc2 = Doc(doc.vocab)
|
||||
doc2.from_bytes(doc.to_bytes())
|
||||
assert doc2.is_parsed
|
||||
assert doc2.has_annotation("DEP")
|
||||
|
||||
|
||||
def test_issue600():
|
||||
|
|
|
@ -14,7 +14,7 @@ from spacy.tokens import Doc, Span, Token
|
|||
from spacy.attrs import HEAD, DEP
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ..util import make_tempdir, get_doc
|
||||
|
||||
|
||||
def test_issue1506():
|
||||
|
@ -198,17 +198,26 @@ def test_issue1834():
|
|||
"""Test that sentence boundaries & parse/tag flags are not lost
|
||||
during serialization."""
|
||||
string = "This is a first sentence . And another one"
|
||||
doc = Doc(Vocab(), words=string.split())
|
||||
doc[6].sent_start = True
|
||||
words = string.split()
|
||||
doc = get_doc(Vocab(), words=words)
|
||||
doc[6].is_sent_start = True
|
||||
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc[6].sent_start
|
||||
assert not new_doc.is_parsed
|
||||
assert not new_doc.is_tagged
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
assert not new_doc.has_annotation("DEP")
|
||||
assert not new_doc.has_annotation("TAG")
|
||||
doc = get_doc(
|
||||
Vocab(),
|
||||
words=words,
|
||||
tags=["TAG"] * len(words),
|
||||
heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
|
||||
deps=["dep"] * len(words),
|
||||
)
|
||||
print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
|
||||
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
|
||||
assert new_doc.is_parsed
|
||||
assert new_doc.is_tagged
|
||||
print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
|
||||
assert new_doc[6].sent_start
|
||||
assert new_doc.has_annotation("DEP")
|
||||
assert new_doc.has_annotation("TAG")
|
||||
|
||||
|
||||
def test_issue1868():
|
||||
|
|
|
@ -72,8 +72,6 @@ def test_issue2219(en_vocab):
|
|||
def test_issue2361(de_tokenizer):
|
||||
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
|
||||
doc = de_tokenizer('< > & " ')
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
html = render(doc)
|
||||
for char in chars:
|
||||
assert char in html
|
||||
|
@ -108,6 +106,7 @@ def test_issue2385_biluo(tags):
|
|||
def test_issue2396(en_vocab):
|
||||
words = ["She", "created", "a", "test", "for", "spacy"]
|
||||
heads = [1, 0, 1, -2, -1, -1]
|
||||
deps = ["dep"] * len(heads)
|
||||
matrix = numpy.array(
|
||||
[
|
||||
[0, 1, 1, 1, 1, 1],
|
||||
|
@ -119,7 +118,7 @@ def test_issue2396(en_vocab):
|
|||
],
|
||||
dtype=numpy.int32,
|
||||
)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
span = doc[:]
|
||||
assert (doc.get_lca_matrix() == matrix).all()
|
||||
assert (span.get_lca_matrix() == matrix).all()
|
||||
|
|
|
@ -16,16 +16,16 @@ from ..util import get_doc
|
|||
|
||||
|
||||
def test_issue2564():
|
||||
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
|
||||
"""Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe."""
|
||||
nlp = Language()
|
||||
tagger = nlp.add_pipe("tagger")
|
||||
tagger.add_label("A")
|
||||
nlp.begin_training()
|
||||
doc = nlp("hello world")
|
||||
assert doc.is_tagged
|
||||
assert doc.has_annotation("TAG")
|
||||
docs = nlp.pipe(["hello", "world"])
|
||||
piped_doc = next(docs)
|
||||
assert piped_doc.is_tagged
|
||||
assert piped_doc.has_annotation("TAG")
|
||||
|
||||
|
||||
def test_issue2569(en_tokenizer):
|
||||
|
@ -123,7 +123,7 @@ def test_issue2772(en_vocab):
|
|||
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, 2, 1, -3, -4]
|
||||
deps = ["dep"] * len(heads)
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
assert doc[1].is_sent_start is None
|
||||
assert doc[1].is_sent_start is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
|
||||
|
|
|
@ -63,7 +63,7 @@ def test_issue3012(en_vocab):
|
|||
pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
|
||||
ents = [(2, 4, "PERCENT")]
|
||||
doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
|
||||
assert doc.is_tagged
|
||||
assert doc.has_annotation("TAG")
|
||||
|
||||
expected = ("10", "NUM", "CD", "PERCENT")
|
||||
assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
|
||||
|
@ -83,10 +83,14 @@ def test_issue3012(en_vocab):
|
|||
def test_issue3199():
|
||||
"""Test that Span.noun_chunks works correctly if no noun chunks iterator
|
||||
is available. To make this test future-proof, we're constructing a Doc
|
||||
with a new Vocab here and setting is_parsed to make sure the noun chunks run.
|
||||
with a new Vocab here and a parse tree to make sure the noun chunks run.
|
||||
"""
|
||||
doc = Doc(Vocab(), words=["This", "is", "a", "sentence"])
|
||||
doc.is_parsed = True
|
||||
doc = get_doc(
|
||||
Vocab(),
|
||||
words=["This", "is", "a", "sentence"],
|
||||
heads=[0, -1, -2, -3],
|
||||
deps=["dep"] * 4,
|
||||
)
|
||||
assert list(doc[0:3].noun_chunks) == []
|
||||
|
||||
|
||||
|
@ -250,16 +254,16 @@ def test_issue3456():
|
|||
|
||||
|
||||
def test_issue3468():
|
||||
"""Test that sentence boundaries are set correctly so Doc.is_sentenced can
|
||||
"""Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
|
||||
be restored after serialization."""
|
||||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
doc = nlp("Hello world")
|
||||
assert doc[0].is_sent_start
|
||||
assert doc.is_sentenced
|
||||
assert doc.has_annotation("SENT_START")
|
||||
assert len(list(doc.sents)) == 1
|
||||
doc_bytes = doc.to_bytes()
|
||||
new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
|
||||
assert new_doc[0].is_sent_start
|
||||
assert new_doc.is_sentenced
|
||||
assert new_doc.has_annotation("SENT_START")
|
||||
assert len(list(new_doc.sents)) == 1
|
||||
|
|
|
@ -356,7 +356,6 @@ def test_issue3882(en_vocab):
|
|||
copy of the Doc.
|
||||
"""
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
doc.is_parsed = True
|
||||
doc.user_data["test"] = set()
|
||||
parse_deps(doc)
|
||||
|
||||
|
@ -386,7 +385,6 @@ def test_issue3959():
|
|||
doc[0].pos_ = "NOUN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
with make_tempdir() as tmp_dir:
|
||||
file_path = tmp_dir / "my_doc"
|
||||
doc.to_disk(file_path)
|
||||
|
|
|
@ -189,7 +189,6 @@ def test_issue4133(en_vocab):
|
|||
for i, token in enumerate(doc):
|
||||
token.pos_ = pos[i]
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
doc_bytes = doc.to_bytes()
|
||||
vocab = Vocab()
|
||||
vocab = vocab.from_bytes(vocab_bytes)
|
||||
|
@ -249,7 +248,7 @@ def test_issue4267():
|
|||
assert "ner" in nlp.pipe_names
|
||||
# assert that we have correct IOB annotations
|
||||
doc1 = nlp("hi")
|
||||
assert doc1.is_nered
|
||||
assert doc1.has_annotation("ENT_IOB")
|
||||
for token in doc1:
|
||||
assert token.ent_iob == 2
|
||||
# add entity ruler and run again
|
||||
|
@ -260,7 +259,7 @@ def test_issue4267():
|
|||
assert "ner" in nlp.pipe_names
|
||||
# assert that we still have correct IOB annotations
|
||||
doc2 = nlp("hi")
|
||||
assert doc2.is_nered
|
||||
assert doc2.has_annotation("ENT_IOB")
|
||||
for token in doc2:
|
||||
assert token.ent_iob == 2
|
||||
|
||||
|
|
|
@ -298,4 +298,4 @@ def test_language_init_invalid_vocab(value):
|
|||
err_fragment = "invalid value"
|
||||
with pytest.raises(ValueError) as e:
|
||||
Language(value)
|
||||
assert err_fragment in str(e)
|
||||
assert err_fragment in str(e.value)
|
||||
|
|
|
@ -80,7 +80,6 @@ def tagged_doc():
|
|||
doc[i].morph_ = morphs[i]
|
||||
if i > 0:
|
||||
doc[i].is_sent_start = False
|
||||
doc.is_tagged = True
|
||||
return doc
|
||||
|
||||
|
||||
|
|
|
@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
|||
[
|
||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
|
||||
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
|
||||
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
||||
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
||||
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
|
||||
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
|
|
|
@ -12,7 +12,7 @@ from thinc.api import compounding
|
|||
import pytest
|
||||
import srsly
|
||||
|
||||
from .util import make_tempdir
|
||||
from .util import make_tempdir, get_doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -26,24 +26,16 @@ def doc():
|
|||
"NounType=prop|Number=sing", "PunctType=peri"]
|
||||
# head of '.' is intentionally nonprojective for testing
|
||||
heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5]
|
||||
heads = [head - i for i, head in enumerate(heads)]
|
||||
deps = ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
||||
lemmas = ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."]
|
||||
biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||
ents = ((0, 2, "PERSON"), (5, 7, "LOC"), (8, 9, "GPE"))
|
||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||
# fmt: on
|
||||
nlp = English()
|
||||
doc = nlp(text)
|
||||
for i in range(len(tags)):
|
||||
doc[i].tag_ = tags[i]
|
||||
doc[i].pos_ = pos[i]
|
||||
doc[i].morph_ = morphs[i]
|
||||
doc[i].lemma_ = lemmas[i]
|
||||
doc[i].dep_ = deps[i]
|
||||
doc[i].head = doc[heads[i]]
|
||||
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
||||
words = [t.text for t in nlp.make_doc(text)]
|
||||
doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
|
||||
doc.cats = cats
|
||||
doc.is_tagged = True
|
||||
doc.is_parsed = True
|
||||
return doc
|
||||
|
||||
|
||||
|
@ -194,7 +186,7 @@ def test_json2docs_no_ner(en_vocab):
|
|||
docs = json2docs(data)
|
||||
assert len(docs) == 1
|
||||
for doc in docs:
|
||||
assert not doc.is_nered
|
||||
assert not doc.has_annotation("ENT_IOB")
|
||||
for token in doc:
|
||||
assert token.ent_iob == 0
|
||||
eg = Example(
|
||||
|
|
|
@ -274,7 +274,7 @@ def _merge(Doc doc, merges):
|
|||
for i in range(doc.length):
|
||||
doc.c[i].head -= i
|
||||
# Set the left/right children, left/right edges
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
# Make sure ent_iob remains consistent
|
||||
make_iob_consistent(doc.c, doc.length)
|
||||
# Return the merged Python object
|
||||
|
@ -381,7 +381,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
|||
for i in range(doc.length):
|
||||
doc.c[i].head -= i
|
||||
# set children from head
|
||||
set_children_from_heads(doc.c, doc.length)
|
||||
set_children_from_heads(doc.c, 0, doc.length)
|
||||
|
||||
|
||||
def _validate_extensions(extensions):
|
||||
|
@ -408,7 +408,6 @@ cdef make_iob_consistent(TokenC* tokens, int length):
|
|||
def normalize_token_attrs(Vocab vocab, attrs):
|
||||
if "_" in attrs: # Extension attributes
|
||||
extensions = attrs["_"]
|
||||
print("EXTENSIONS", extensions)
|
||||
_validate_extensions(extensions)
|
||||
attrs = {key: value for key, value in attrs.items() if key != "_"}
|
||||
attrs = intify_attrs(attrs, strings_map=vocab.strings)
|
||||
|
|
|
@ -13,7 +13,7 @@ from ..errors import Errors
|
|||
from ..util import ensure_path, SimpleFrozenList
|
||||
|
||||
# fmt: off
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
|
||||
# fmt: on
|
||||
|
||||
|
||||
|
|
|
@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
|
|||
const_TokenC_ptr
|
||||
|
||||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
|
||||
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1
|
||||
|
||||
|
||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1
|
||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1
|
||||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2
|
||||
|
@ -31,9 +31,6 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
|
|||
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||
|
||||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1
|
||||
|
||||
|
||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
||||
|
||||
cdef class Doc:
|
||||
|
@ -49,10 +46,6 @@ cdef class Doc:
|
|||
|
||||
cdef TokenC* c
|
||||
|
||||
cdef public bint is_tagged
|
||||
cdef public bint is_parsed
|
||||
cdef public bint is_morphed
|
||||
|
||||
cdef public float sentiment
|
||||
|
||||
cdef public dict user_hooks
|
||||
|
@ -74,5 +67,3 @@ cdef class Doc:
|
|||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
|
||||
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
||||
cdef void set_parse(self, const TokenC* parsed) nogil
|
||||
|
|
|
@ -1,37 +1,34 @@
|
|||
# cython: infer_types=True, bounds_check=False, profile=True
|
||||
cimport cython
|
||||
cimport numpy as np
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.string cimport memcpy
|
||||
from libc.math cimport sqrt
|
||||
from libc.stdint cimport int32_t, uint64_t
|
||||
|
||||
import copy
|
||||
from collections import Counter
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
import struct
|
||||
import srsly
|
||||
from thinc.api import get_array_module
|
||||
from thinc.util import copy_array
|
||||
import warnings
|
||||
import copy
|
||||
|
||||
from .span cimport Span
|
||||
from .token cimport Token
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
|
||||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, NORM
|
||||
|
||||
from ..attrs import intify_attr, intify_attrs, IDS
|
||||
from ..util import normalize_slice
|
||||
from ..attrs import intify_attr, IDS
|
||||
from ..compat import copy_reg, pickle
|
||||
from ..errors import Errors, Warnings
|
||||
from ..morphology import Morphology
|
||||
from .. import util
|
||||
from .underscore import Underscore, get_ext_args
|
||||
from ._retokenize import Retokenizer
|
||||
from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS
|
||||
|
||||
|
||||
DEF PADDING = 5
|
||||
|
@ -190,8 +187,6 @@ cdef class Doc:
|
|||
self.c = data_start + PADDING
|
||||
self.max_length = size
|
||||
self.length = 0
|
||||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
self.sentiment = 0.0
|
||||
self.cats = {}
|
||||
self.user_hooks = {}
|
||||
|
@ -221,11 +216,6 @@ cdef class Doc:
|
|||
else:
|
||||
lexeme = self.vocab.get_by_orth(self.mem, word)
|
||||
self.push_back(lexeme, has_space)
|
||||
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
|
||||
# There's no information we'd like to add to it, so I guess so?
|
||||
if self.length == 0:
|
||||
self.is_tagged = True
|
||||
self.is_parsed = True
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
|
@ -233,37 +223,61 @@ cdef class Doc:
|
|||
return Underscore(Underscore.doc_extensions, self)
|
||||
|
||||
@property
|
||||
def is_sentenced(self):
|
||||
"""Check if the document has sentence boundaries assigned. This is
|
||||
defined as having at least one of the following:
|
||||
def is_tagged(self):
|
||||
warnings.warn(Warnings.W107.format(prop="is_tagged", attr="TAG"), DeprecationWarning)
|
||||
return self.has_annotation("TAG")
|
||||
|
||||
a) An entry "sents" in doc.user_hooks";
|
||||
b) Doc.is_parsed is set to True;
|
||||
c) At least one token other than the first where sent_start is not None.
|
||||
"""
|
||||
if "sents" in self.user_hooks:
|
||||
return True
|
||||
if self.is_parsed:
|
||||
return True
|
||||
if len(self) < 2:
|
||||
return True
|
||||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start == -1 or self.c[i].sent_start == 1:
|
||||
return True
|
||||
return False
|
||||
@property
|
||||
def is_parsed(self):
|
||||
warnings.warn(Warnings.W107.format(prop="is_parsed", attr="DEP"), DeprecationWarning)
|
||||
return self.has_annotation("DEP")
|
||||
|
||||
@property
|
||||
def is_nered(self):
|
||||
"""Check if the document has named entities set. Will return True if
|
||||
*any* of the tokens has a named entity tag set (even if the others are
|
||||
unknown values), or if the document is empty.
|
||||
warnings.warn(Warnings.W107.format(prop="is_nered", attr="ENT_IOB"), DeprecationWarning)
|
||||
return self.has_annotation("ENT_IOB")
|
||||
|
||||
@property
|
||||
def is_sentenced(self):
|
||||
warnings.warn(Warnings.W107.format(prop="is_sentenced", attr="SENT_START"), DeprecationWarning)
|
||||
return self.has_annotation("SENT_START")
|
||||
|
||||
def has_annotation(self, attr, *, require_complete=False):
|
||||
"""Check whether the doc contains annotation on a token attribute.
|
||||
|
||||
attr (Union[int, str]): The attribute string name or int ID.
|
||||
require_complete (bool): Whether to check that the attribute is set on
|
||||
every token in the doc.
|
||||
RETURNS (bool): Whether annotation is present.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/doc#has_annotation
|
||||
"""
|
||||
if len(self) == 0:
|
||||
|
||||
# empty docs are always annotated
|
||||
if self.length == 0:
|
||||
return True
|
||||
for i in range(self.length):
|
||||
if self.c[i].ent_iob != 0:
|
||||
cdef int i
|
||||
cdef int range_start = 0
|
||||
attr = intify_attr(attr)
|
||||
# adjust attributes
|
||||
if attr == HEAD:
|
||||
# HEAD does not have an unset state, so rely on DEP
|
||||
attr = DEP
|
||||
elif attr == self.vocab.strings["IS_SENT_START"]:
|
||||
# as in Matcher, allow IS_SENT_START as an alias of SENT_START
|
||||
attr = SENT_START
|
||||
# special cases for sentence boundaries
|
||||
if attr == SENT_START:
|
||||
if "sents" in self.user_hooks:
|
||||
return True
|
||||
return False
|
||||
# docs of length 1 always have sentence boundaries
|
||||
if self.length == 1:
|
||||
return True
|
||||
range_start = 1
|
||||
if require_complete:
|
||||
return all(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
|
||||
else:
|
||||
return any(Token.get_struct_attr(&self.c[i], attr) for i in range(range_start, self.length))
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
@ -291,7 +305,7 @@ cdef class Doc:
|
|||
DOCS: https://nightly.spacy.io/api/doc#getitem
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
start, stop = util.normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
return Span(self, start, stop, label=0)
|
||||
if i < 0:
|
||||
i = self.length + i
|
||||
|
@ -627,16 +641,13 @@ cdef class Doc:
|
|||
@property
|
||||
def sents(self):
|
||||
"""Iterate over the sentences in the document. Yields sentence `Span`
|
||||
objects. Sentence spans have no label. To improve accuracy on informal
|
||||
texts, spaCy calculates sentence boundaries from the syntactic
|
||||
dependency parse. If the parser is disabled, the `sents` iterator will
|
||||
be unavailable.
|
||||
objects. Sentence spans have no label.
|
||||
|
||||
YIELDS (Span): Sentences in the document.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/doc#sents
|
||||
"""
|
||||
if not self.is_sentenced:
|
||||
if not self.has_annotation("SENT_START"):
|
||||
raise ValueError(Errors.E030)
|
||||
if "sents" in self.user_hooks:
|
||||
yield from self.user_hooks["sents"](self)
|
||||
|
@ -660,10 +671,6 @@ cdef class Doc:
|
|||
return self.vocab.lang
|
||||
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||
if self.length == 0:
|
||||
# Flip these to false when we see the first token.
|
||||
self.is_tagged = False
|
||||
self.is_parsed = False
|
||||
if self.length == self.max_length:
|
||||
self._realloc(self.length * 2)
|
||||
cdef TokenC* t = &self.c[self.length]
|
||||
|
@ -786,14 +793,6 @@ cdef class Doc:
|
|||
for i in range(self.length, self.max_length + PADDING):
|
||||
self.c[i].lex = &EMPTY_LEXEME
|
||||
|
||||
cdef void set_parse(self, const TokenC* parsed) nogil:
|
||||
# TODO: This method is fairly misleading atm. It's used by Parser
|
||||
# to actually apply the parse calculated. Need to rethink this.
|
||||
# Probably we should use from_array?
|
||||
self.is_parsed = True
|
||||
for i in range(self.length):
|
||||
self.c[i] = parsed[i]
|
||||
|
||||
def from_array(self, attrs, array):
|
||||
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
||||
`(M, N)` array of attributes.
|
||||
|
@ -818,8 +817,8 @@ cdef class Doc:
|
|||
if array.dtype != numpy.uint64:
|
||||
warnings.warn(Warnings.W028.format(type=array.dtype))
|
||||
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(Errors.E032)
|
||||
if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
|
||||
warnings.warn(Warnings.W106)
|
||||
cdef int i, col
|
||||
cdef int32_t abs_head_index
|
||||
cdef attr_id_t attr_id
|
||||
|
@ -879,18 +878,17 @@ cdef class Doc:
|
|||
# add morph to morphology table
|
||||
self.vocab.morphology.add(self.vocab.strings[value])
|
||||
Token.set_struct_attr(token, attr_ids[j], value)
|
||||
# Set flags
|
||||
self.is_parsed = bool(self.is_parsed or HEAD in attrs)
|
||||
self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
|
||||
# If document is parsed, set children
|
||||
if self.is_parsed:
|
||||
set_children_from_heads(self.c, length)
|
||||
# If document is parsed, set children and sentence boundaries
|
||||
if HEAD in attrs and DEP in attrs:
|
||||
col = attrs.index(DEP)
|
||||
if array[:, col].any():
|
||||
set_children_from_heads(self.c, 0, length)
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def from_docs(docs, ensure_whitespace=True, attrs=None):
|
||||
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
|
||||
the same `Vocab`.
|
||||
"""Concatenate multiple Doc objects to form a new one. Raises an error
|
||||
if the `Doc` objects do not all share the same `Vocab`.
|
||||
|
||||
docs (list): A list of Doc objects.
|
||||
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
|
||||
|
@ -908,16 +906,7 @@ cdef class Doc:
|
|||
(vocab,) = vocab
|
||||
|
||||
if attrs is None:
|
||||
attrs = [LEMMA, NORM]
|
||||
if all(doc.is_nered for doc in docs):
|
||||
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
|
||||
# TODO: separate for is_morphed?
|
||||
if all(doc.is_tagged for doc in docs):
|
||||
attrs.extend([TAG, POS, MORPH])
|
||||
if all(doc.is_parsed for doc in docs):
|
||||
attrs.extend([HEAD, DEP])
|
||||
else:
|
||||
attrs.append(SENT_START)
|
||||
attrs = Doc._get_array_attrs()
|
||||
else:
|
||||
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
|
||||
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
|
||||
|
@ -989,9 +978,6 @@ cdef class Doc:
|
|||
other.tensor = copy.deepcopy(self.tensor)
|
||||
other.cats = copy.deepcopy(self.cats)
|
||||
other.user_data = copy.deepcopy(self.user_data)
|
||||
other.is_tagged = self.is_tagged
|
||||
other.is_parsed = self.is_parsed
|
||||
other.is_morphed = self.is_morphed
|
||||
other.sentiment = self.sentiment
|
||||
other.has_unknown_spaces = self.has_unknown_spaces
|
||||
other.user_hooks = dict(self.user_hooks)
|
||||
|
@ -1065,22 +1051,16 @@ cdef class Doc:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/doc#to_bytes
|
||||
"""
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, NORM, ENT_KB_ID]
|
||||
if self.is_tagged:
|
||||
array_head.extend([TAG, POS])
|
||||
# If doc parsed add head and dep attribute
|
||||
if self.is_parsed:
|
||||
array_head.extend([HEAD, DEP])
|
||||
# Otherwise add sent_start
|
||||
else:
|
||||
array_head.append(SENT_START)
|
||||
array_head = Doc._get_array_attrs()
|
||||
strings = set()
|
||||
for token in self:
|
||||
strings.add(token.tag_)
|
||||
strings.add(token.lemma_)
|
||||
strings.add(token.morph_)
|
||||
strings.add(token.dep_)
|
||||
strings.add(token.ent_type_)
|
||||
strings.add(token.ent_kb_id_)
|
||||
strings.add(token.ent_id_)
|
||||
strings.add(token.norm_)
|
||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||
# vexing for user data. As a best guess, we *know* that within
|
||||
|
@ -1230,22 +1210,29 @@ cdef class Doc:
|
|||
DOCS: https://nightly.spacy.io/api/doc#to_json
|
||||
"""
|
||||
data = {"text": self.text}
|
||||
if self.is_nered:
|
||||
if self.has_annotation("ENT_IOB"):
|
||||
data["ents"] = [{"start": ent.start_char, "end": ent.end_char,
|
||||
"label": ent.label_} for ent in self.ents]
|
||||
if self.is_sentenced:
|
||||
if self.has_annotation("SENT_START"):
|
||||
sents = list(self.sents)
|
||||
data["sents"] = [{"start": sent.start_char, "end": sent.end_char}
|
||||
for sent in sents]
|
||||
if self.cats:
|
||||
data["cats"] = self.cats
|
||||
data["tokens"] = []
|
||||
attrs = ["TAG", "MORPH", "POS", "LEMMA", "DEP"]
|
||||
include_annotation = {attr: self.has_annotation(attr) for attr in attrs}
|
||||
for token in self:
|
||||
token_data = {"id": token.i, "start": token.idx, "end": token.idx + len(token)}
|
||||
if self.is_tagged:
|
||||
token_data["pos"] = token.pos_
|
||||
if include_annotation["TAG"]:
|
||||
token_data["tag"] = token.tag_
|
||||
if self.is_parsed:
|
||||
if include_annotation["POS"]:
|
||||
token_data["pos"] = token.pos_
|
||||
if include_annotation["MORPH"]:
|
||||
token_data["morph"] = token.morph_
|
||||
if include_annotation["LEMMA"]:
|
||||
token_data["lemma"] = token.lemma_
|
||||
if include_annotation["DEP"]:
|
||||
token_data["dep"] = token.dep_
|
||||
token_data["head"] = token.head.i
|
||||
data["tokens"].append(token_data)
|
||||
|
@ -1291,6 +1278,12 @@ cdef class Doc:
|
|||
j += 1
|
||||
return output
|
||||
|
||||
@staticmethod
|
||||
def _get_array_attrs():
|
||||
attrs = [LENGTH, SPACY]
|
||||
attrs.extend(intify_attr(x) for x in DOCBIN_ALL_ATTRS)
|
||||
return tuple(attrs)
|
||||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
||||
cdef int i = token_by_char(tokens, length, start_char)
|
||||
|
@ -1321,13 +1314,13 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
|
|||
return mid
|
||||
return -1
|
||||
|
||||
|
||||
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
||||
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
||||
# note: end is exclusive
|
||||
cdef TokenC* head
|
||||
cdef TokenC* child
|
||||
cdef int i
|
||||
# Set number of left/right children to 0. We'll increment it in the loops.
|
||||
for i in range(length):
|
||||
for i in range(start, end):
|
||||
tokens[i].l_kids = 0
|
||||
tokens[i].r_kids = 0
|
||||
tokens[i].l_edge = i
|
||||
|
@ -1341,38 +1334,40 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
|
|||
# without risking getting stuck in an infinite loop if something is
|
||||
# terribly malformed.
|
||||
while not heads_within_sents:
|
||||
heads_within_sents = _set_lr_kids_and_edges(tokens, length, loop_count)
|
||||
heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
|
||||
if loop_count > 10:
|
||||
warnings.warn(Warnings.W026)
|
||||
break
|
||||
loop_count += 1
|
||||
# Set sentence starts
|
||||
for i in range(length):
|
||||
if tokens[i].head == 0 and tokens[i].dep != 0:
|
||||
tokens[tokens[i].l_edge].sent_start = True
|
||||
for i in range(start, end):
|
||||
tokens[i].sent_start = -1
|
||||
for i in range(start, end):
|
||||
if tokens[i].head == 0:
|
||||
tokens[tokens[i].l_edge].sent_start = 1
|
||||
|
||||
|
||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) except -1:
|
||||
cdef int _set_lr_kids_and_edges(TokenC* tokens, int start, int end, int loop_count) except -1:
|
||||
# May be called multiple times due to non-projectivity. See issues #3170
|
||||
# and #4688.
|
||||
# Set left edges
|
||||
cdef TokenC* head
|
||||
cdef TokenC* child
|
||||
cdef int i, j
|
||||
for i in range(length):
|
||||
for i in range(start, end):
|
||||
child = &tokens[i]
|
||||
head = &tokens[i + child.head]
|
||||
if child < head and loop_count == 0:
|
||||
if loop_count == 0 and child < head:
|
||||
head.l_kids += 1
|
||||
if child.l_edge < head.l_edge:
|
||||
head.l_edge = child.l_edge
|
||||
if child.r_edge > head.r_edge:
|
||||
head.r_edge = child.r_edge
|
||||
# Set right edges - same as above, but iterate in reverse
|
||||
for i in range(length-1, -1, -1):
|
||||
for i in range(end-1, start-1, -1):
|
||||
child = &tokens[i]
|
||||
head = &tokens[i + child.head]
|
||||
if child > head and loop_count == 0:
|
||||
if loop_count == 0 and child > head:
|
||||
head.r_kids += 1
|
||||
if child.r_edge > head.r_edge:
|
||||
head.r_edge = child.r_edge
|
||||
|
@ -1380,14 +1375,14 @@ cdef int _set_lr_kids_and_edges(TokenC* tokens, int length, int loop_count) exce
|
|||
head.l_edge = child.l_edge
|
||||
# Get sentence start positions according to current state
|
||||
sent_starts = set()
|
||||
for i in range(length):
|
||||
if tokens[i].head == 0 and tokens[i].dep != 0:
|
||||
for i in range(start, end):
|
||||
if tokens[i].head == 0:
|
||||
sent_starts.add(tokens[i].l_edge)
|
||||
cdef int curr_sent_start = 0
|
||||
cdef int curr_sent_end = 0
|
||||
# Check whether any heads are not within the current sentence
|
||||
for i in range(length):
|
||||
if (i > 0 and i in sent_starts) or i == length - 1:
|
||||
for i in range(start, end):
|
||||
if (i > 0 and i in sent_starts) or i == end - 1:
|
||||
curr_sent_end = i
|
||||
for j in range(curr_sent_start, curr_sent_end):
|
||||
if tokens[j].head + j < curr_sent_start or tokens[j].head + j >= curr_sent_end + 1:
|
||||
|
@ -1436,6 +1431,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
|||
with shape (n, n), where n = len(doc).
|
||||
"""
|
||||
cdef int [:,:] lca_matrix
|
||||
cdef int j, k
|
||||
n_tokens= end - start
|
||||
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||
lca_mat.fill(-1)
|
||||
|
|
|
@ -4,13 +4,10 @@ cimport numpy as np
|
|||
from libc.math cimport sqrt
|
||||
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
from thinc.api import get_array_module
|
||||
from collections import defaultdict
|
||||
import warnings
|
||||
|
||||
from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
|
||||
from .token cimport TokenC
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||
from ..attrs cimport attr_id_t
|
||||
|
@ -204,7 +201,7 @@ cdef class Span:
|
|||
return Underscore(Underscore.span_extensions, self,
|
||||
start=self.start_char, end=self.end_char)
|
||||
|
||||
def as_doc(self, bint copy_user_data=False):
|
||||
def as_doc(self, *, bint copy_user_data=False):
|
||||
"""Create a `Doc` object with a copy of the `Span`'s data.
|
||||
|
||||
copy_user_data (bool): Whether or not to copy the original doc's user data.
|
||||
|
@ -212,19 +209,10 @@ cdef class Span:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/span#as_doc
|
||||
"""
|
||||
# TODO: make copy_user_data a keyword-only argument (Python 3 only)
|
||||
words = [t.text for t in self]
|
||||
spaces = [bool(t.whitespace_) for t in self]
|
||||
cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces)
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_ID, ENT_KB_ID]
|
||||
if self.doc.is_tagged:
|
||||
array_head.append(TAG)
|
||||
# If doc parsed add head and dep attribute
|
||||
if self.doc.is_parsed:
|
||||
array_head.extend([HEAD, DEP])
|
||||
# Otherwise add sent_start
|
||||
else:
|
||||
array_head.append(SENT_START)
|
||||
array_head = self.doc._get_array_attrs()
|
||||
array = self.doc.to_array(array_head)
|
||||
array = array[self.start : self.end]
|
||||
self._fix_dep_copy(array_head, array)
|
||||
|
@ -378,7 +366,7 @@ cdef class Span:
|
|||
self.doc.sents
|
||||
# Use `sent_start` token attribute to find sentence boundaries
|
||||
cdef int n = 0
|
||||
if self.doc.is_sentenced:
|
||||
if self.doc.has_annotation("SENT_START"):
|
||||
# Find start of the sentence
|
||||
start = self.start
|
||||
while self.doc.c[start].sent_start != 1 and start > 0:
|
||||
|
@ -510,8 +498,6 @@ cdef class Span:
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/span#noun_chunks
|
||||
"""
|
||||
if not self.doc.is_parsed:
|
||||
raise ValueError(Errors.E029)
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
# prevents the tokenisation from being changed out from under us
|
||||
# during the iteration. The tricky thing here is that Span accepts
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
# cython: infer_types=True
|
||||
from libc.string cimport memcpy
|
||||
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||
from cython.view cimport array as cvarray
|
||||
cimport numpy as np
|
||||
|
@ -14,14 +12,13 @@ from ..typedefs cimport hash_t
|
|||
from ..lexeme cimport Lexeme
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
|
||||
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, IS_CURRENCY, IS_STOP
|
||||
from ..attrs cimport LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||
from ..symbols cimport conj
|
||||
from .morphanalysis cimport MorphAnalysis
|
||||
from .doc cimport set_children_from_heads
|
||||
|
||||
from .. import parts_of_speech
|
||||
from .. import util
|
||||
from ..errors import Errors, Warnings
|
||||
from .underscore import Underscore, get_ext_args
|
||||
|
||||
|
@ -489,7 +486,7 @@ cdef class Token:
|
|||
return True
|
||||
|
||||
def __set__(self, value):
|
||||
if self.doc.is_parsed:
|
||||
if self.doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E043)
|
||||
if value is None:
|
||||
self.c.sent_start = 0
|
||||
|
@ -658,78 +655,19 @@ cdef class Token:
|
|||
# Do nothing if old head is new head
|
||||
if self.i + self.c.head == new_head.i:
|
||||
return
|
||||
cdef Token old_head = self.head
|
||||
cdef int rel_newhead_i = new_head.i - self.i
|
||||
# Is the new head a descendant of the old head
|
||||
cdef bint is_desc = old_head.is_ancestor(new_head)
|
||||
cdef int new_edge
|
||||
cdef Token anc, child
|
||||
# Update number of deps of old head
|
||||
if self.c.head > 0: # left dependent
|
||||
old_head.c.l_kids -= 1
|
||||
if self.c.l_edge == old_head.c.l_edge:
|
||||
# The token dominates the left edge so the left edge of
|
||||
# the head may change when the token is reattached, it may
|
||||
# not change if the new head is a descendant of the current
|
||||
# head.
|
||||
new_edge = self.c.l_edge
|
||||
# The new l_edge is the left-most l_edge on any of the
|
||||
# other dependents where the l_edge is left of the head,
|
||||
# otherwise it is the head
|
||||
if not is_desc:
|
||||
new_edge = old_head.i
|
||||
for child in old_head.children:
|
||||
if child == self:
|
||||
continue
|
||||
if child.c.l_edge < new_edge:
|
||||
new_edge = child.c.l_edge
|
||||
old_head.c.l_edge = new_edge
|
||||
# Walk up the tree from old_head and assign new l_edge to
|
||||
# ancestors until an ancestor already has an l_edge that's
|
||||
# further left
|
||||
for anc in old_head.ancestors:
|
||||
if anc.c.l_edge <= new_edge:
|
||||
break
|
||||
anc.c.l_edge = new_edge
|
||||
elif self.c.head < 0: # right dependent
|
||||
old_head.c.r_kids -= 1
|
||||
# Do the same thing as for l_edge
|
||||
if self.c.r_edge == old_head.c.r_edge:
|
||||
new_edge = self.c.r_edge
|
||||
if not is_desc:
|
||||
new_edge = old_head.i
|
||||
for child in old_head.children:
|
||||
if child == self:
|
||||
continue
|
||||
if child.c.r_edge > new_edge:
|
||||
new_edge = child.c.r_edge
|
||||
old_head.c.r_edge = new_edge
|
||||
for anc in old_head.ancestors:
|
||||
if anc.c.r_edge >= new_edge:
|
||||
break
|
||||
anc.c.r_edge = new_edge
|
||||
# Update number of deps of new head
|
||||
if rel_newhead_i > 0: # left dependent
|
||||
new_head.c.l_kids += 1
|
||||
# Walk up the tree from new head and set l_edge to self.l_edge
|
||||
# until you hit a token with an l_edge further to the left
|
||||
if self.c.l_edge < new_head.c.l_edge:
|
||||
new_head.c.l_edge = self.c.l_edge
|
||||
for anc in new_head.ancestors:
|
||||
if anc.c.l_edge <= self.c.l_edge:
|
||||
break
|
||||
anc.c.l_edge = self.c.l_edge
|
||||
elif rel_newhead_i < 0: # right dependent
|
||||
new_head.c.r_kids += 1
|
||||
# Do the same as for l_edge
|
||||
if self.c.r_edge > new_head.c.r_edge:
|
||||
new_head.c.r_edge = self.c.r_edge
|
||||
for anc in new_head.ancestors:
|
||||
if anc.c.r_edge >= self.c.r_edge:
|
||||
break
|
||||
anc.c.r_edge = self.c.r_edge
|
||||
# Find the widest l/r_edges of the roots of the two tokens involved
|
||||
# to limit the number of tokens for set_children_from_heads
|
||||
cdef Token self_root, new_head_root
|
||||
self_ancestors = list(self.ancestors)
|
||||
new_head_ancestors = list(new_head.ancestors)
|
||||
self_root = self_ancestors[-1] if self_ancestors else self
|
||||
new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
|
||||
start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
|
||||
end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
|
||||
# Set new head
|
||||
self.c.head = rel_newhead_i
|
||||
self.c.head = new_head.i - self.i
|
||||
# Adjust parse properties and sentence starts
|
||||
set_children_from_heads(self.doc.c, start, end + 1)
|
||||
|
||||
@property
|
||||
def conjuncts(self):
|
||||
|
|
|
@ -212,8 +212,6 @@ def doc_from_conllu_sentence(
|
|||
doc[i]._.merged_spaceafter = spaces[i]
|
||||
ents = get_entities(lines, ner_tag_pattern, ner_map)
|
||||
doc.ents = spans_from_biluo_tags(doc, ents)
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
|
||||
if merge_subtokens:
|
||||
doc = merge_conllu_subtokens(lines, doc)
|
||||
|
@ -243,8 +241,6 @@ def doc_from_conllu_sentence(
|
|||
doc_x[i].dep_ = deps[i]
|
||||
doc_x[i].head = doc_x[heads[i]]
|
||||
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||
doc_x.is_parsed = True
|
||||
doc_x.is_tagged = True
|
||||
|
||||
return doc_x
|
||||
|
||||
|
|
|
@ -33,19 +33,25 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
||||
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
||||
if doc.is_tagged:
|
||||
if include_annotation["TAG"]:
|
||||
json_token["tag"] = token.tag_
|
||||
if include_annotation["POS"]:
|
||||
json_token["pos"] = token.pos_
|
||||
if include_annotation["MORPH"]:
|
||||
json_token["morph"] = token.morph_
|
||||
if include_annotation["LEMMA"]:
|
||||
json_token["lemma"] = token.lemma_
|
||||
if doc.is_parsed:
|
||||
if include_annotation["DEP"]:
|
||||
json_token["head"] = token.head.i-token.i
|
||||
json_token["dep"] = token.dep_
|
||||
json_token["ner"] = biluo_tags[token.i]
|
||||
if include_annotation["ENT_IOB"]:
|
||||
json_token["ner"] = biluo_tags[token.i]
|
||||
json_sent["tokens"].append(json_token)
|
||||
json_para["sentences"].append(json_sent)
|
||||
json_doc["paragraphs"].append(json_para)
|
||||
|
|
|
@ -267,6 +267,17 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
|||
| ----------- | -------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
|
||||
|
||||
## Doc.has_annotation {#has_annotation tag="method"}
|
||||
|
||||
Check whether the doc contains annotation on a token attribute.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------ | --------------------------------------------------------------------------------------------------- |
|
||||
| `attr` | The attribute string name or int ID. ~~Union[int, str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `require_complete` | Whether to check that the attribute is set on every token in the doc. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | Whether specified annotation is present in the doc. ~~bool~~ |
|
||||
|
||||
## Doc.to_array {#to_array tag="method"}
|
||||
|
||||
Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence
|
||||
|
@ -609,26 +620,22 @@ The L2 norm of the document's vector representation.
|
|||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | A string representation of the document text. ~~str~~ |
|
||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
|
||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
|
||||
| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
|
||||
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
|
||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | A string representation of the document text. ~~str~~ |
|
||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
|
||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
|
||||
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
|
||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -410,6 +410,7 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
|||
| ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
|
||||
| [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
|
||||
| [`Doc.has_annotation`](/api/doc#has_annotation) | Check whether a doc has annotation on a token attribute. |
|
||||
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
|
||||
| [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
|
||||
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
|
||||
|
@ -805,6 +806,25 @@ nlp = spacy.blank("en")
|
|||
+ ruler.load_from_tag_map(YOUR_TAG_MAP)
|
||||
```
|
||||
|
||||
### Migrating Doc flags {#migrating-doc-flags}
|
||||
|
||||
The `Doc` flags `Doc.is_tagged`, `Doc.is_parsed`, `Doc.is_nered` and
|
||||
`Doc.is_sentenced` are deprecated in v3 and replaced by the
|
||||
[`Doc.has_annotation`](/api/doc#has_annotation) method, which refers to the
|
||||
token attribute symbols (the same symbols used in `Matcher` patterns):
|
||||
|
||||
```diff
|
||||
doc = nlp(text)
|
||||
- doc.is_parsed
|
||||
+ doc.has_annotation("DEP")
|
||||
- doc.is_tagged
|
||||
+ doc.has_annotation("TAG")
|
||||
- doc.is_sentenced
|
||||
+ doc.has_annotation("SENT_START")
|
||||
- doc.is_nered
|
||||
+ doc.has_annotation("ENT_IOB")
|
||||
```
|
||||
|
||||
### Training pipelines and models {#migrating-training}
|
||||
|
||||
To train your pipelines, you should now pretty much always use the
|
||||
|
|
Loading…
Reference in New Issue
Block a user