Merge branch 'develop' into feature/spacy-legacy

This commit is contained in:
Ines Montani 2021-01-18 11:43:39 +11:00
commit 1090d3d675
30 changed files with 29035 additions and 28725 deletions

View File

@ -463,12 +463,14 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. " "If you're using a custom function, make sure the code is available. "
"If the function is provided by a third-party package, e.g. " "If the function is provided by a third-party package, e.g. "
"spacy-transformers, make sure the package is installed in your " "spacy-transformers, make sure the package is installed in your "
"environment.\n\nAvailable names: {available}") "environment.\n\nAvailable names: {available}")
E894 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with " E895 = ("The 'textcat' component received gold-standard annotations with "
"multiple labels per document. In spaCy 3 you should use the " "multiple labels per document. In spaCy 3 you should use the "
"'textcat_multilabel' component for this instead. " "'textcat_multilabel' component for this instead. "

View File

@ -86,7 +86,7 @@ def like_num(text):
if text in _num_words: if text in _num_words:
return True return True
# CHeck ordinal number # Check ordinal number
if text in _ordinal_words: if text in _ordinal_words:
return True return True
return False return False

View File

@ -18,8 +18,6 @@ class MacedonianLemmatizer(Lemmatizer):
string = string[:-3] string = string[:-3]
univ_pos = "verb" univ_pos = "verb"
if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
return [string.lower()]
index_table = self.lookups.get_table("lemma_index", {}) index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {}) exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {}) rules_table = self.lookups.get_table("lemma_rules", {})

View File

@ -697,6 +697,8 @@ class Language:
source_config = source.config.interpolate() source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name]) pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config self._pipe_configs[name] = pipe_config
for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"] return pipe, pipe_config["factory"]
def add_pipe( def add_pipe(
@ -1619,9 +1621,7 @@ class Language:
if model not in source_nlps: if model not in source_nlps:
# We only need the components here and we need to init # We only need the components here and we need to init
# model with the same vocab as the current nlp object # model with the same vocab as the current nlp object
source_nlps[model] = util.load_model( source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
model, vocab=nlp.vocab, disable=["vocab", "tokenizer"]
)
source_name = pipe_cfg.get("component", pipe_name) source_name = pipe_cfg.get("component", pipe_name)
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
disabled_pipes = [*config["nlp"]["disabled"], *disable] disabled_pipes = [*config["nlp"]["disabled"], *disable]

View File

@ -197,13 +197,39 @@ cdef class ArcEagerGold:
self.mem = Pool() self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True) heads, labels = example.get_aligned_parse(projectivize=True)
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels] labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
sent_starts = example.get_aligned_sent_starts() sent_starts = _get_aligned_sent_starts(example)
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
def update(self, StateClass stcls): def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c) update_gold_state(&self.c, stcls.c)
def _get_aligned_sent_starts(example):
"""Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has not sentence starts, return a list of None values.
This function is slightly different from the one on Example, because we also
check whether the reference sentences align across multiple sentences,
and return missing values if they do. This prevents a problem where you have
the start of a sentence merged onto a token that belongs to two sentences.
"""
if example.y.has_annotation("SENT_START"):
align = example.alignment.y2x
sent_starts = [False] * len(example.x)
seen_words = set()
for y_sent in example.y.sents:
x_indices = list(align[y_sent.start : y_sent.end].dataXd)
if any(x_idx in seen_words for x_idx in x_indices):
# If there are any tokens in X that align across two sentences,
# regard the sentence annotations as missing, as we can't
# reliably use them.
return [None] * len(example.x)
seen_words.update(x_indices)
sent_starts[x_indices[0]] = True
return sent_starts
else:
return [None] * len(example.x)
cdef int check_state_gold(char state_bits, char flag) nogil: cdef int check_state_gold(char state_bits, char flag) nogil:
cdef char one = 1 cdef char one = 1
@ -820,7 +846,7 @@ cdef class ArcEager(TransitionSystem):
else: else:
failed = False failed = False
break break
if failed: if failed and _debug not in (False, None):
example = _debug example = _debug
print("Actions") print("Actions")
for i in range(self.n_moves): for i in range(self.n_moves):

View File

@ -1,7 +1,11 @@
import srsly
from thinc.api import Config
from typing import Dict, Any
from ..language import Language from ..language import Language
from ..matcher import Matcher from ..matcher import Matcher
from ..tokens import Doc from ..tokens import Doc
from ..util import filter_spans from ..util import filter_spans
from .. import util
@Language.component( @Language.component(
@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
for span in spans: for span in spans:
retokenizer.merge(span) retokenizer.merge(span)
return doc return doc
@Language.factory(
    "token_splitter",
    default_config={"min_length": 25, "split_length": 10},
    retokenizes=True,
)
def make_token_splitter(
    nlp: Language,
    name: str,
    *,
    min_length=0,
    split_length=0,
):
    """Create the "token_splitter" pipeline component.

    nlp (Language): The pipeline object. Not used by this factory.
    name (str): The component instance name. Not used by this factory.
    min_length: Forwarded to TokenSplitter as `min_length`.
    split_length: Forwarded to TokenSplitter as `split_length`.
    RETURNS (TokenSplitter): The configured component.
    """
    splitter = TokenSplitter(min_length=min_length, split_length=split_length)
    return splitter
class TokenSplitter:
    """Pipeline component that splits long tokens into fixed-size pieces.

    min_length (int): Tokens whose text has at least this many characters
        are split. Splitting only happens if this is greater than 0.
    split_length (int): Number of characters per piece; the last piece of a
        token may be shorter. Splitting only happens if this is greater
        than 0.
    """

    def __init__(self, min_length: int = 0, split_length: int = 0):
        self.min_length = min_length
        self.split_length = split_length

    def __call__(self, doc: Doc) -> Doc:
        """Split every token whose text reaches the minimum length.

        doc (Doc): The document to process.
        RETURNS (Doc): The same object that was passed in.
        """
        if self.min_length > 0 and self.split_length > 0:
            with doc.retokenize() as retokenizer:
                for t in doc:
                    if len(t.text) >= self.min_length:
                        orths = [
                            t.text[i : i + self.split_length]
                            for i in range(0, len(t.text), self.split_length)
                        ]
                        # One (token, position) head per piece, using the
                        # piece's own position. Positions are integers
                        # rather than the floats true division would give.
                        heads = [(t, idx) for idx in range(len(orths))]
                        attrs = {}
                        retokenizer.split(t, orths, heads, attrs)
        return doc

    def _get_config(self) -> Dict[str, Any]:
        """Return the current settings as a JSON-serializable dict."""
        return {
            "min_length": self.min_length,
            "split_length": self.split_length,
        }

    # The default dict is only read via .get() and never mutated, so sharing
    # it across calls is harmless.
    def _set_config(self, config: Dict[str, Any] = {}) -> None:
        """Load settings from a dict; missing keys fall back to 0."""
        self.min_length = config.get("min_length", 0)
        self.split_length = config.get("split_length", 0)

    def to_bytes(self, **kwargs):
        """Serialize the settings to a bytestring."""
        serializers = {
            "cfg": lambda: srsly.json_dumps(self._get_config()),
        }
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        """Load the settings from a bytestring and return the component."""
        deserializers = {
            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
        }
        util.from_bytes(data, deserializers, [])
        return self

    def to_disk(self, path, **kwargs):
        """Write the settings as JSON below the given path."""
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: srsly.write_json(p, self._get_config()),
        }
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        """Load the settings from the given path and return the component."""
        path = util.ensure_path(path)
        deserializers = {
            "cfg": lambda p: self._set_config(srsly.read_json(p)),
        }
        util.from_disk(path, deserializers, [])
        # Return the component, as from_bytes does, so calls can be chained.
        return self

View File

@ -145,6 +145,10 @@ class Morphologizer(Tagger):
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
# if both are unset, annotation is missing, so do not add
# an empty label
if pos == "" and not token.has_morph():
continue
morph = str(token.morph) morph = str(token.morph)
# create and add the combined morph+POS label # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
@ -155,7 +159,7 @@ class Morphologizer(Tagger):
if norm_label not in self.cfg["labels_morph"]: if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
if len(self.labels) <= 1: if len(self.labels) < 1:
raise ValueError(Errors.E143.format(name=self.name)) raise ValueError(Errors.E143.format(name=self.name))
doc_sample = [] doc_sample = []
label_sample = [] label_sample = []
@ -217,15 +221,24 @@ class Morphologizer(Tagger):
pos = pos_tags[i] pos = pos_tags[i]
morph = morphs[i] morph = morphs[i]
# POS may align (same value for multiple tokens) when morph # POS may align (same value for multiple tokens) when morph
# doesn't, so if either is None, treat both as None here so that # doesn't, so if either is misaligned (None), treat the
# truths doesn't end up with an unknown morph+POS combination # annotation as missing so that truths doesn't end up with an
# unknown morph+POS combination
if pos is None or morph is None: if pos is None or morph is None:
label = None label = None
# If both are unset, the annotation is missing (empty morph
# converted from int is "_" rather than "")
elif pos == "" and morph == "":
label = None
# Otherwise, generate the combined label
else: else:
label_dict = Morphology.feats_to_dict(morph) label_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
label_dict[self.POS_FEAT] = pos label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)] label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
# As a fail-safe, skip any unrecognized labels
if label not in self.labels:
label = None
eg_truths.append(label) eg_truths.append(label)
truths.append(eg_truths) truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths) d_scores, loss = loss_func(scores, truths)

View File

@ -2,6 +2,8 @@ import pytest
import numpy import numpy
import logging import logging
import mock import mock
from spacy.lang.xx import MultiLanguage
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.lexeme import Lexeme from spacy.lexeme import Lexeme
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
doc.ents = spans doc.ents = spans
def test_doc_noun_chunks_not_implemented():
    """Test that a language without a noun_chunks iterator raises a NotImplementedError."""
    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
    doc = MultiLanguage()(text)
    with pytest.raises(NotImplementedError):
        # The property is a generator, so it has to be consumed to raise.
        list(doc.noun_chunks)
def test_span_groups(en_tokenizer): def test_span_groups(en_tokenizer):
doc = en_tokenizer("Some text about Colombia and the Czech Republic") doc = en_tokenizer("Some text about Colombia and the Czech Republic")
doc.spans["hi"] = [Span(doc, 3, 4, label="bye")] doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]

View File

@ -1,11 +1,16 @@
import numpy
from spacy.attrs import HEAD, DEP
from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tokens import Doc from spacy.tokens import Doc
import pytest import pytest
@pytest.fixture
def doc(en_vocab):
    """Build a seven-token English Doc with heads, dependency labels and POS tags."""
    # One row per token: (word, head, dependency label, POS tag).
    rows = [
        ("Peter", 1, "nsubj", "PROPN"),
        ("has", 1, "ROOT", "VERB"),
        ("chronic", 6, "amod", "ADJ"),
        ("command", 6, "nmod", "NOUN"),
        ("and", 3, "cc", "CCONJ"),
        ("control", 3, "conj", "NOUN"),
        ("issues", 1, "dobj", "NOUN"),
    ]
    words, heads, deps, pos = (list(column) for column in zip(*rows))
    return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
def test_noun_chunks_is_parsed(en_tokenizer): def test_noun_chunks_is_parsed(en_tokenizer):
"""Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed.""" """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
doc = en_tokenizer("This is a sentence") doc = en_tokenizer("This is a sentence")
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
list(doc.noun_chunks) list(doc.noun_chunks)
def test_en_noun_chunks_not_nested(en_vocab): def test_en_noun_chunks_not_nested(doc, en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] """Test that each token only appears in one noun chunk at most"""
heads = [1, 1, 6, 6, 3, 3, 1]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
doc.from_array(
[HEAD, DEP],
numpy.asarray(
[
[1, nsubj],
[0, root],
[4, amod],
[3, nmod],
[-1, cc],
[-2, conj],
[-5, dobj],
],
dtype="uint64",
),
)
doc.noun_chunks_iterator = noun_chunks
word_occurred = {} word_occurred = {}
for chunk in doc.noun_chunks: chunks = list(doc.noun_chunks)
assert len(chunks) > 1
for chunk in chunks:
for word in chunk: for word in chunk:
word_occurred.setdefault(word.text, 0) word_occurred.setdefault(word.text, 0)
word_occurred[word.text] += 1 word_occurred[word.text] += 1
assert len(word_occurred) > 0
for word, freq in word_occurred.items(): for word, freq in word_occurred.items():
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
def test_noun_chunks_span(doc, en_tokenizer):
    """Test that Span.noun_chunks yields only doc chunks that lie inside the span."""
    span_end = 3
    all_chunks = list(doc.noun_chunks)
    inner_chunks = list(doc[0:span_end].noun_chunks)
    # The span must contain some, but not all, of the doc's chunks.
    assert 0 < len(inner_chunks) < len(all_chunks)
    for found in inner_chunks:
        assert found in all_chunks
        assert found.start >= 0
        assert found.end <= span_end

View File

@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
assert len(doc2) == 6 assert len(doc2) == 6
assert len(list(doc2.ents)) == 1 assert len(list(doc2.ents)) == 1
assert doc2[2].text == "New York" assert doc2[2].text == "New York"
def test_token_splitter():
    """Test that only tokens reaching min_length are split into split_length pieces."""
    nlp = Language()
    splitter = nlp.add_pipe(
        "token_splitter", config={"min_length": 20, "split_length": 5}
    )
    # The first token has 19 characters, one short of min_length: kept whole.
    short_doc = nlp("aaaaabbbbbcccccdddd e f g")
    short_texts = [token.text for token in short_doc]
    assert short_texts == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
    # The first token has 27 characters: split into five full pieces plus a
    # shorter remainder.
    long_doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
    long_texts = [token.text for token in long_doc]
    expected = ["aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "ff", "g", "h", "i"]
    assert long_texts == expected
    for token in long_doc:
        assert len(token.text) <= splitter.split_length

View File

@ -136,3 +136,28 @@ def test_overfitting_IO():
gold_pos_tags = ["", "", "", ""] gold_pos_tags = ["", "", "", ""]
assert [str(t.morph) for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Test with unset morph and partial POS
nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer")
for example in train_examples:
for token in example.reference:
if token.text == "ham":
token.pos_ = "NOUN"
else:
token.pos_ = ""
token.set_morph(None)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
print(nlp.get_pipe("morphologizer").labels)
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["morphologizer"] < 0.00001
# Test the trained model
test_text = "I like blue ham"
doc = nlp(test_text)
gold_morphs = ["", "", "", ""]
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags

View File

@ -81,7 +81,8 @@ def test_issue3199():
""" """
words = ["This", "is", "a", "sentence"] words = ["This", "is", "a", "sentence"]
doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
assert list(doc[0:3].noun_chunks) == [] with pytest.raises(NotImplementedError):
list(doc[0:3].noun_chunks)
def test_issue3209(): def test_issue3209():

View File

@ -816,8 +816,10 @@ cdef class Doc:
@property @property
def noun_chunks(self): def noun_chunks(self):
"""Iterate over the base noun phrases in the document. Yields base """Iterate over the base noun phrases in the document. Yields base
noun-phrase #[code Span] objects, if the document has been noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
syntactically parsed. A base noun phrase, or "NP chunk", is a noun Raises a NotImplementedError otherwise.
A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it so no phrase that does not permit other NPs to be nested within it so no
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
@ -826,14 +828,15 @@ cdef class Doc:
DOCS: https://nightly.spacy.io/api/doc#noun_chunks DOCS: https://nightly.spacy.io/api/doc#noun_chunks
""" """
if self.noun_chunks_iterator is None:
raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us # prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span # its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375. # objects. See Issue #375.
spans = [] spans = []
if self.noun_chunks_iterator is not None:
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: for span in spans:

View File

@ -487,29 +487,24 @@ cdef class Span:
""" """
return "".join([t.text_with_ws for t in self]) return "".join([t.text_with_ws for t in self])
@property @property
def noun_chunks(self): def noun_chunks(self):
"""Yields base noun-phrase `Span` objects, if the document has been """Iterate over the base noun phrases in the span. Yields base
syntactically parsed. A base noun phrase, or "NP chunk", is a noun noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
Raises a NotImplementedError otherwise.
A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it so no phrase that does not permit other NPs to be nested within it so no
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Base noun-phrase `Span` objects. YIELDS (Span): Noun chunks in the span.
DOCS: https://nightly.spacy.io/api/span#noun_chunks DOCS: https://nightly.spacy.io/api/span#noun_chunks
""" """
# Accumulate the result before beginning to iterate over it. This for span in self.doc.noun_chunks:
# prevents the tokenisation from being changed out from under us if span.start >= self.start and span.end <= self.end:
# during the iteration. The tricky thing here is that Span accepts
# its tokenisation changing, so it's okay once we have the Span
# objects. See Issue #375
spans = []
cdef attr_t label
if self.doc.noun_chunks_iterator is not None:
for start, end, label in self.doc.noun_chunks_iterator(self):
spans.append(Span(self.doc, start, end, label=label))
for span in spans:
yield span yield span
@property @property

View File

@ -211,6 +211,14 @@ cdef class Token:
xp = get_array_module(vector) xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
def has_morph(self):
    """Check whether the token has annotated morph information.
    Return False when the morph annotation is unset/missing.

    RETURNS (bool): Whether the morph annotation is set.
    """
    # A stored morph value of 0 means the annotation is unset.
    return self.c.morph != 0
property morph: property morph:
def __get__(self): def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph) return MorphAnalysis.from_id(self.vocab, self.c.morph)

View File

@ -200,10 +200,6 @@ cdef class Example:
def get_aligned_sent_starts(self): def get_aligned_sent_starts(self):
"""Get list of SENT_START attributes aligned to the predicted tokenization. """Get list of SENT_START attributes aligned to the predicted tokenization.
If the reference has not sentence starts, return a list of None values. If the reference has not sentence starts, return a list of None values.
The aligned sentence starts use the get_aligned_spans method, rather
than aligning the list of tags, so that it handles cases where a mistaken
tokenization starts the sentence.
""" """
if self.y.has_annotation("SENT_START"): if self.y.has_annotation("SENT_START"):
align = self.alignment.y2x align = self.alignment.y2x

View File

@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
nested within it so no NP-level coordination, no prepositional phrases, and no nested within it so no NP-level coordination, no prepositional phrases, and no
relative clauses. relative clauses.
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
not been implemented for the given language, a `NotImplementedError` is raised.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp("A phrase with another phrase occurs.") > doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks) > chunks = list(doc.noun_chunks)
> assert len(chunks) == 2
> assert chunks[0].text == "A phrase" > assert chunks[0].text == "A phrase"
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```

View File

@ -6,6 +6,7 @@ menu:
- ['merge_noun_chunks', 'merge_noun_chunks'] - ['merge_noun_chunks', 'merge_noun_chunks']
- ['merge_entities', 'merge_entities'] - ['merge_entities', 'merge_entities']
- ['merge_subtokens', 'merge_subtokens'] - ['merge_subtokens', 'merge_subtokens']
- ['token_splitter', 'token_splitter']
--- ---
## merge_noun_chunks {#merge_noun_chunks tag="function"} ## merge_noun_chunks {#merge_noun_chunks tag="function"}
@ -107,3 +108,25 @@ end of the pipeline and after all other components.
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ | | `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ |
| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ | | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ |
## token_splitter {#token_splitter tag="function" new="3.0"}
Split tokens that reach a minimum length into shorter tokens. Intended for use
with transformer pipelines where long spaCy tokens lead to input text that
exceeds the transformer model max length. See
[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
> #### Example
>
> ```python
> config={"min_length": 20, "split_length": 5}
> nlp.add_pipe("token_splitter", config=config, first=True)
> doc = nlp("aaaaabbbbbcccccdddddee")
> print([token.text for token in doc])
> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
> ```
| Setting | Description |
| -------------- | --------------------------------------------------------------------- |
| `min_length` | The minimum length for a token to be split. Defaults to `25`. ~~int~~ |
| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~ |

View File

@ -274,6 +274,31 @@ if the entity recognizer has been applied.
| ----------- | ----------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
objects, if the document has been syntactically parsed. A base noun phrase, or
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
it so no NP-level coordination, no prepositional phrases, and no relative
clauses.
If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
not been implemented for the given language, a `NotImplementedError` is raised.
> #### Example
>
> ```python
> doc = nlp("A phrase with another phrase occurs.")
> span = doc[3:5]
> chunks = list(span.noun_chunks)
> assert len(chunks) == 1
> assert chunks[0].text == "another phrase"
> ```
| Name | Description |
| ---------- | --------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
## Span.as_doc {#as_doc tag="method"} ## Span.as_doc {#as_doc tag="method"}
Create a new `Doc` object corresponding to the `Span`, with a copy of the data. Create a new `Doc` object corresponding to the `Span`, with a copy of the data.

View File

@ -191,6 +191,15 @@ the morph to an unset state.
| -------- | --------------------------------------------------------------------------------- | | -------- | --------------------------------------------------------------------------------- |
| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ | | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
## Token.has_morph {#has_morph tag="method"}
Check whether the token has annotated morph information. Return `False` when the
morph annotation is unset/missing.
| Name | Description |
| ----------- | --------------------------------------------- |
| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
## Token.is_ancestor {#is_ancestor tag="method" model="parser"} ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
Check whether this token is a parent, grandparent, etc. of another in the Check whether this token is a parent, grandparent, etc. of another in the

View File

@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
provide a schedule, allowing you to freeze the shared parameters at the start of provide a schedule, allowing you to freeze the shared parameters at the start of
training. training.
### Managing transformer model max length limitations {#transformer-max-length}
Many transformer models have a limit on the maximum number of tokens that the
model can process. For example, BERT models are limited to 512 tokens. This limit
refers to the number of transformer tokens (BPE, WordPiece, etc.), not the
number of spaCy tokens.
To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
component uses [`span_getters`](/api/transformer#span_getters) to convert a
batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
a window of spaCy tokens (`strided_spans`). If a single span corresponds to more
transformer tokens than the transformer model supports, the spaCy pipeline can't
process the text because some spaCy tokens would be left without an analysis.
In general, it is up to the transformer pipeline user to manage the input texts
so that the model max length is not exceeded. If you're training a **new
pipeline**, you have a number of options to handle the max length limit:
- Use `doc_spans` with short texts only
- Use `sent_spans` with short sentences only
- For `strided_spans`, lower the `window` size to be short enough for your input
texts (and don't forget to lower the `stride` correspondingly)
- Implement a [custom span getter](#transformers-training-custom-settings)
You may still run into the max length limit if a single spaCy token is very
long, like a long URL or a noisy string, or if you're using a **pretrained
pipeline** like `en_core_web_trf` with a fixed `window` size for
`strided_spans`. In this case, you need to modify either your texts or your
pipeline so that you have shorter spaCy tokens. Some options:
- Preprocess your texts to clean up noise and split long tokens with whitespace
- Add a `token_splitter` to the beginning of your pipeline to break up
tokens that are longer than a specified length:
```python
config={"min_length": 20, "split_length": 5}
nlp.add_pipe("token_splitter", config=config, first=True)
```
In this example, tokens that are at least 20 characters long will be split up
into smaller tokens of 5 characters each, resulting in strided spans that
correspond to fewer transformer tokens.
## Static vectors {#static-vectors} ## Static vectors {#static-vectors}
If your pipeline includes a **word vectors table**, you'll be able to use the If your pipeline includes a **word vectors table**, you'll be able to use the

View File

@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" flat phrases that have a noun as their
head. You can think of noun chunks as a noun plus the words describing the noun head. You can think of noun chunks as a noun plus the words describing the noun
for example, "the lavish green grass" or "the worlds largest tech fund". To for example, "the lavish green grass" or "the worlds largest tech fund". To
get the noun chunks in a document, simply iterate over get the noun chunks in a document, simply iterate over
[`Doc.noun_chunks`](/api/doc#noun_chunks) [`Doc.noun_chunks`](/api/doc#noun_chunks).
```python ```python
### {executable="true"} ### {executable="true"}

View File

@ -2139,7 +2139,7 @@
"from negspacy.negation import Negex", "from negspacy.negation import Negex",
"", "",
"nlp = spacy.load(\"en_core_web_sm\")", "nlp = spacy.load(\"en_core_web_sm\")",
"negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])", "negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
"nlp.add_pipe(negex, last=True)", "nlp.add_pipe(negex, last=True)",
"", "",
"doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",

15978
website/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@
"private": true, "private": true,
"description": "spaCy website", "description": "spaCy website",
"version": "3.0.0", "version": "3.0.0",
"author": "Explosion AI <contact@explosion.ai>", "author": "Explosion <contact@explosion.ai>",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@jupyterlab/outputarea": "^0.19.1", "@jupyterlab/outputarea": "^0.19.1",
@ -16,7 +16,7 @@
"autoprefixer": "^9.4.7", "autoprefixer": "^9.4.7",
"classnames": "^2.2.6", "classnames": "^2.2.6",
"codemirror": "^5.43.0", "codemirror": "^5.43.0",
"gatsby": "^2.1.18", "gatsby": "^2.11.1",
"gatsby-image": "^2.0.29", "gatsby-image": "^2.0.29",
"gatsby-mdx": "^0.3.6", "gatsby-mdx": "^0.3.6",
"gatsby-plugin-catch-links": "^2.0.11", "gatsby-plugin-catch-links": "^2.0.11",
@ -24,12 +24,14 @@
"gatsby-plugin-offline": "^2.0.24", "gatsby-plugin-offline": "^2.0.24",
"gatsby-plugin-plausible": "0.0.6", "gatsby-plugin-plausible": "0.0.6",
"gatsby-plugin-react-helmet": "^3.0.6", "gatsby-plugin-react-helmet": "^3.0.6",
"gatsby-plugin-react-svg": "^2.1.2", "gatsby-plugin-react-svg": "^2.0.0",
"gatsby-plugin-robots-txt": "^1.5.1",
"gatsby-plugin-sass": "^2.0.10", "gatsby-plugin-sass": "^2.0.10",
"gatsby-plugin-sharp": "^2.0.20", "gatsby-plugin-sharp": "^2.0.20",
"gatsby-plugin-sitemap": "^2.0.5", "gatsby-plugin-sitemap": "^2.0.5",
"gatsby-plugin-svgr": "^2.0.1", "gatsby-plugin-svgr": "^2.0.1",
"gatsby-remark-copy-linked-files": "^2.0.9", "gatsby-remark-copy-linked-files": "^2.0.9",
"gatsby-remark-find-replace": "^0.3.0",
"gatsby-remark-images": "^3.0.4", "gatsby-remark-images": "^3.0.4",
"gatsby-remark-prismjs": "^3.2.4", "gatsby-remark-prismjs": "^3.2.4",
"gatsby-remark-smartypants": "^2.0.8", "gatsby-remark-smartypants": "^2.0.8",
@ -39,9 +41,11 @@
"gatsby-transformer-sharp": "^2.1.13", "gatsby-transformer-sharp": "^2.1.13",
"html-to-react": "^1.3.4", "html-to-react": "^1.3.4",
"intersection-observer": "^0.5.1", "intersection-observer": "^0.5.1",
"jinja-to-js": "^3.2.3",
"node-sass": "^4.11.0", "node-sass": "^4.11.0",
"parse-numeric-range": "0.0.2", "parse-numeric-range": "0.0.2",
"prismjs": "^1.15.0", "prismjs": "^1.15.0",
"prismjs-bibtex": "^1.1.0",
"prop-types": "^15.7.2", "prop-types": "^15.7.2",
"react": "^16.8.2", "react": "^16.8.2",
"react-dom": "^16.8.2", "react-dom": "^16.8.2",
@ -50,19 +54,22 @@
"remark-react": "^5.0.1" "remark-react": "^5.0.1"
}, },
"scripts": { "scripts": {
"build": "gatsby build", "build": "npm run python:install && npm run python:setup && gatsby build",
"dev": "gatsby develop", "dev": "npm run python:setup && gatsby develop",
"dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
"lint": "eslint **", "lint": "eslint **",
"clear": "rm -rf .cache", "clear": "rm -rf .cache",
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"" "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
"python:install": "pip install -r setup/requirements.txt",
"python:setup": "cd setup && sh setup.sh"
}, },
"devDependencies": { "devDependencies": {
"@sindresorhus/slugify": "^0.8.0",
"browser-monads": "^1.0.0", "browser-monads": "^1.0.0",
"md-attr-parser": "^1.2.1", "md-attr-parser": "^1.2.1",
"prettier": "^1.16.4", "prettier": "^1.16.4",
"raw-loader": "^1.0.0", "raw-loader": "^1.0.0",
"unist-util-visit": "^1.4.0", "unist-util-visit": "^1.4.0"
"@sindresorhus/slugify": "^0.8.0"
}, },
"repository": { "repository": {
"type": "git", "type": "git",

View File

@ -6,7 +6,7 @@ import classNames from 'classnames'
import Link from './link' import Link from './link'
import Grid from './grid' import Grid from './grid'
import Newsletter from './newsletter' import Newsletter from './newsletter'
import ExplosionLogo from '-!svg-react-loader!../images/explosion.svg' import { ReactComponent as ExplosionLogo } from '../images/explosion.svg'
import classes from '../styles/footer.module.sass' import classes from '../styles/footer.module.sass'
export default function Footer({ wide = false }) { export default function Footer({ wide = false }) {

View File

@ -1,4 +1,4 @@
import React from 'react' import React, { Fragment } from 'react'
import PropTypes from 'prop-types' import PropTypes from 'prop-types'
import classNames from 'classnames' import classNames from 'classnames'
@ -19,7 +19,13 @@ import NoIcon from '-!svg-react-loader!../images/icons/no.svg'
import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg' import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg'
import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg' import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg'
import SearchIcon from '-!svg-react-loader!../images/icons/search.svg' import SearchIcon from '-!svg-react-loader!../images/icons/search.svg'
import MoonIcon from '-!svg-react-loader!../images/icons/moon.svg'
import ClipboardIcon from '-!svg-react-loader!../images/icons/clipboard.svg'
import NetworkIcon from '-!svg-react-loader!../images/icons/network.svg'
import DownloadIcon from '-!svg-react-loader!../images/icons/download.svg'
import PackageIcon from '-!svg-react-loader!../images/icons/package.svg'
import { isString } from './util'
import classes from '../styles/icon.module.sass' import classes from '../styles/icon.module.sass'
const icons = { const icons = {
@ -41,9 +47,22 @@ const icons = {
neutral: NeutralIcon, neutral: NeutralIcon,
offline: OfflineIcon, offline: OfflineIcon,
search: SearchIcon, search: SearchIcon,
moon: MoonIcon,
clipboard: ClipboardIcon,
network: NetworkIcon,
download: DownloadIcon,
package: PackageIcon,
} }
const Icon = ({ name, width, height, inline, variant, className }) => { export default function Icon({
name,
width = 20,
height,
inline = false,
variant,
className,
...props
}) {
const IconComponent = icons[name] const IconComponent = icons[name]
const iconClassNames = classNames(classes.root, className, { const iconClassNames = classNames(classes.root, className, {
[classes.inline]: inline, [classes.inline]: inline,
@ -57,15 +76,11 @@ const Icon = ({ name, width, height, inline, variant, className }) => {
aria-hidden="true" aria-hidden="true"
width={width} width={width}
height={height || width} height={height || width}
{...props}
/> />
) )
} }
Icon.defaultProps = {
width: 20,
inline: false,
}
Icon.propTypes = { Icon.propTypes = {
name: PropTypes.oneOf(Object.keys(icons)), name: PropTypes.oneOf(Object.keys(icons)),
width: PropTypes.number, width: PropTypes.number,
@ -75,4 +90,43 @@ Icon.propTypes = {
className: PropTypes.string, className: PropTypes.string,
} }
export default Icon export function replaceEmoji(cellChildren) {
const icons = {
'✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' },
'❌': { name: 'no', variant: 'error', 'aria-label': 'negative' },
}
const iconRe = new RegExp(`^(${Object.keys(icons).join('|')})`, 'g')
let children = isString(cellChildren) ? [cellChildren] : cellChildren
let hasIcon = false
if (Array.isArray(children)) {
children = children.map((child, i) => {
if (isString(child)) {
const icon = icons[child.trim()]
if (icon) {
hasIcon = true
return (
<Icon
{...icon}
inline={i < children.length}
aria-hidden={undefined}
key={i}
/>
)
} else if (iconRe.test(child)) {
hasIcon = true
const [, iconName, text] = child.split(iconRe)
return (
<Fragment key={i}>
<Icon {...icons[iconName]} aria-hidden={undefined} inline={true} />
{text.replace(/^\s+/g, '')}
</Fragment>
)
}
// Work around prettier auto-escape
if (child.startsWith('\\')) return child.slice(1)
}
return child
})
}
return { content: children, hasIcon }
}

View File

@ -6,7 +6,7 @@ import Link from './link'
import Icon from './icon' import Icon from './icon'
import Dropdown from './dropdown' import Dropdown from './dropdown'
import { github } from './util' import { github } from './util'
import Logo from '-!svg-react-loader!../images/logo.svg' import { ReactComponent as Logo } from '../images/logo.svg'
import classes from '../styles/navigation.module.sass' import classes from '../styles/navigation.module.sass'
const NavigationDropdown = ({ items = [], section }) => { const NavigationDropdown = ({ items = [], section }) => {

View File

@ -1,31 +0,0 @@
import AirbnbLogo from '-!svg-react-loader!./airbnb.svg'
import UberLogo from '-!svg-react-loader!./uber.svg'
import QuoraLogo from '-!svg-react-loader!./quora.svg'
import RetrieverLogo from '-!svg-react-loader!./retriever.svg'
import StitchfixLogo from '-!svg-react-loader!./stitchfix.svg'
import ChartbeatLogo from '-!svg-react-loader!./chartbeat.svg'
import AllenAILogo from '-!svg-react-loader!./allenai.svg'
import RecodeLogo from '-!svg-react-loader!./recode.svg'
import WapoLogo from '-!svg-react-loader!./wapo.svg'
import BBCLogo from '-!svg-react-loader!./bbc.svg'
import MicrosoftLogo from '-!svg-react-loader!./microsoft.svg'
import VenturebeatLogo from '-!svg-react-loader!./venturebeat.svg'
import ThoughtworksLogo from '-!svg-react-loader!./thoughtworks.svg'
// Lookup table from a lowercase company key to the logo component imported
// above for that company.
export default {
airbnb: AirbnbLogo,
uber: UberLogo,
quora: QuoraLogo,
retriever: RetrieverLogo,
stitchfix: StitchfixLogo,
chartbeat: ChartbeatLogo,
allenai: AllenAILogo,
recode: RecodeLogo,
wapo: WapoLogo,
bbc: BBCLogo,
microsoft: MicrosoftLogo,
venturebeat: VenturebeatLogo,
thoughtworks: ThoughtworksLogo,
}

View File

@ -4,7 +4,7 @@ import Grid from '../components/grid'
import { Label } from '../components/typography' import { Label } from '../components/typography'
import Link from '../components/link' import Link from '../components/link'
import Logo from '-!svg-react-loader!../images/logo.svg' import { ReactComponent as Logo } from '../images/logo.svg'
import patternBlue from '../images/pattern_blue.jpg' import patternBlue from '../images/pattern_blue.jpg'
import patternGreen from '../images/pattern_green.jpg' import patternGreen from '../images/pattern_green.jpg'
import patternPurple from '../images/pattern_purple.jpg' import patternPurple from '../images/pattern_purple.jpg'