Merge remote-tracking branch 'upstream/develop' into fix/various

svlandeg 2020-10-09 17:01:27 +02:00
commit 08cb085f6c
13 changed files with 58 additions and 65 deletions

View File

@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
     # TODO: fix numbering after merging develop into master
+    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already initialized and trained. ")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "

View File

@@ -1034,6 +1034,9 @@ class Language:
                     )
                 )
             disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)
 
     def make_doc(self, text: str) -> Doc:
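The exclusion above matters when a component was already disabled before entering the context manager. A minimal sketch of that behavior, mirroring the new test_disable_pipes_context_restore test further down (the `spacy.blank`/`sentencizer` setup and the `disable_pipe` call are assumptions about the v3 nightly API, not part of this diff):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.disable_pipe("sentencizer")  # already disabled before the context manager
assert not nlp.has_pipe("sentencizer")

with nlp.select_pipes(disable="sentencizer"):
    assert not nlp.has_pipe("sentencizer")

# Without the exclusion, DisabledPipes.restore() would re-enable the component
# on exit; with it, the previously disabled component stays disabled.
assert not nlp.has_pipe("sentencizer")
```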

View File

@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
         language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
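For reference, a rough usage sketch of the renamed flag. The row and dimension values come from the default config hunk below; the `width` keyword and the `spacy.ml.models` import path are assumptions based on the surrounding config and tests, not shown in this hunk:

```python
from spacy.ml.models import CharacterEmbed

embed = CharacterEmbed(
    width=128,                     # assumed: output width, set in the config below
    rows=7000,
    nM=64,                         # character embedding width
    nC=8,                          # number of UTF-8 bytes embedded per word
    include_static_vectors=False,  # renamed from also_use_static_vectors
)
```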

View File

@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false
 
 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"

View File

@@ -608,14 +608,11 @@ def test_doc_init_iob():
     doc = Doc(Vocab(), words=words, ents=ents)
 
 
-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans

View File

@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1

View File

@@ -129,6 +129,7 @@ def test_enable_pipes_method(nlp, name):
 
 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
@@ -136,6 +137,18 @@ def test_disable_pipes_context(nlp, name):
     assert nlp.has_pipe(name)
 
 
+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipes(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)

View File

@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on

View File

@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm
 
-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)

View File

@@ -150,7 +150,6 @@ cdef class Span:
         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@
         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@
         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]
@@ -339,19 +336,6 @@
                 output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output
 
-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-                or self.doc.c[self.start].idx != self.start_char \
-                or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@
         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
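With `_recalculate_indices` removed, spans created before a retokenization are no longer re-indexed on access. A minimal sketch of the resulting behavior, mirroring the updated test_doc_set_ents_invalid_spans test above (the `spacy.blank` setup is an assumption, not part of this diff):

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Some text about Colombia and the Czech Republic")
spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]

# Merging changes the token offsets, so the spans above become stale
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)

try:
    doc.ents = spans  # previously re-indexed on the fly, now rejected
except IndexError:
    print("stale spans raise IndexError instead of being recalculated")
```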

View File

@@ -249,7 +249,10 @@ def create_evaluation_callback(
 
     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.

View File

@@ -522,9 +522,9 @@ word vector tables using the `include_static_vectors` flag.
 [tagger.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = 128
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
-rows = [7000, 3500, 3500, 3500]
-also_use_static_vectors = true
+attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = true
 ```
 
 <Infobox title="How it works" emoji="💡">

View File

@@ -1403,9 +1403,9 @@ especially useful it you want to pass in a string instead of calling
 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.
 
 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)
 
     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + spans
         return doc  # don't forget to return the Doc!
 
-    def has_country(self, tokens):
-        """Getter for Doc and Span attributes. Since the getter is only called
-        when we access the attribute, we can refer to the Token's 'is_country'
+    def has_country(self, doc):
+        """Getter for Doc attributes. Since the getter is only called
+        when we access the attribute, we can refer to the Span's 'is_country'
         attribute here, which is already set in the processing step."""
-        return any([t._.get("is_country") for t in tokens])
+        return any([entity._.get("is_country") for entity in doc.ents])
 
 
 nlp = English()
 nlp.add_pipe("rest_countries", config={"label": "GPE"})
 doc = nlp("Some text about Colombia and the Czech Republic")
 print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 print("Doc has countries", doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
-print("Entities", [(e.text, e.label_) for e in doc.ents])
+for ent in doc.ents:
+    if ent._.is_country:
+        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
 ```
 
 In this case, all data can be fetched on initialization in one request. However,