spaCy/spacy/tests/regression/test_issue4001-4500.py

448 lines
16 KiB
Python
Raw Normal View History

2020-07-06 15:05:59 +03:00
import pytest
from spacy.pipeline import TrainablePipe
2020-07-06 15:05:59 +03:00
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example, Corpus
2020-09-22 12:50:19 +03:00
from spacy.training.converters import json_to_docs
2020-07-06 15:05:59 +03:00
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from ..util import make_tempdir
@pytest.mark.issue(4002)
2020-07-06 15:05:59 +03:00
def test_issue4002(en_vocab):
2020-09-29 22:39:28 +03:00
"""Test that the PhraseMatcher can match on overwritten NORM attributes."""
2020-07-06 15:05:59 +03:00
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
assert len(matches) == 1
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern2 = Doc(en_vocab, words=["1", "2"])
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
@pytest.mark.issue(4030)
2020-07-06 15:05:59 +03:00
def test_issue4030():
2021-07-02 10:48:26 +03:00
"""Test whether textcat works fine with empty doc"""
2020-07-06 15:05:59 +03:00
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
model = {
"@architectures": "spacy.TextCatBOW.v1",
"exclusive_classes": True,
"ngram_size": 2,
"no_output_layer": False,
}
textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
2020-07-06 15:05:59 +03:00
for label in unique_classes:
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
2020-09-28 22:35:09 +03:00
optimizer = nlp.initialize()
2020-07-06 15:05:59 +03:00
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
2020-09-29 22:39:28 +03:00
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
2020-07-06 15:05:59 +03:00
# processing of an empty doc should result in 0.0 for all categories
doc = nlp("")
assert doc.cats["offensive"] == 0.0
assert doc.cats["inoffensive"] == 0.0
@pytest.mark.issue(4042)
2020-07-06 15:05:59 +03:00
def test_issue4042():
"""Test that serialization of an EntityRuler before NER works fine."""
nlp = English()
# add ner pipe
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
ner = nlp.add_pipe("ner")
2020-07-06 15:05:59 +03:00
ner.add_label("SOME_LABEL")
2020-09-28 22:35:09 +03:00
nlp.initialize()
2020-07-06 15:05:59 +03:00
# Add entity ruler
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
]
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
# works fine with "after"
ruler = nlp.add_pipe("entity_ruler", before="ner")
2020-07-06 15:05:59 +03:00
ruler.add_patterns(patterns)
doc1 = nlp("What do you think about Apple ?")
assert doc1.ents[0].label_ == "MY_ORG"
with make_tempdir() as d:
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
nlp2 = load_model(output_dir)
doc2 = nlp2("What do you think about Apple ?")
assert doc2.ents[0].label_ == "MY_ORG"
@pytest.mark.issue(4042)
2020-07-06 15:05:59 +03:00
def test_issue4042_bug2():
"""
Test that serialization of an NER works fine when new labels were added.
This is the second bug of two bugs underlying the issue 4042.
"""
nlp1 = English()
# add ner pipe
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
ner1 = nlp1.add_pipe("ner")
2020-07-06 15:05:59 +03:00
ner1.add_label("SOME_LABEL")
2020-09-28 22:35:09 +03:00
nlp1.initialize()
2020-07-06 15:05:59 +03:00
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
assert "SOME_LABEL" in ner1.labels
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
doc1.ents = list(doc1.ents) + [apple_ent]
Add beam_parser and beam_ner components for v3 (#6369) * Get basic beam tests working * Get basic beam tests working * Compile _beam_utils * Remove prints * Test beam density * Beam parser seems to train * Draft beam NER * Upd beam * Add hypothesis as dev dependency * Implement missing is-gold-parse method * Implement early update * Fix state hashing * Fix test * Fix test * Default to non-beam in parser constructor * Improve oracle for beam * Start refactoring beam * Update test * Refactor beam * Update nn * Refactor beam and weight by cost * Update ner beam settings * Update test * Add __init__.pxd * Upd test * Fix test * Upd test * Fix test * Remove ring buffer history from StateC * WIP change arc-eager transitions * Add state tests * Support ternary sent start values * Fix arc eager * Fix NER * Pass oracle cut size for beam * Fix ner test * Fix beam * Improve StateC.clone * Improve StateClass.borrow * Work directly with StateC, not StateClass * Remove print statements * Fix state copy * Improve state class * Refactor parser oracles * Fix arc eager oracle * Fix arc eager oracle * Use a vector to implement the stack * Refactor state data structure * Fix alignment of sent start * Add get_aligned_sent_starts method * Add test for ae oracle when bad sentence starts * Fix sentence segment handling * Avoid Reduce that inserts illegal sentence * Update preset SBD test * Fix test * Remove prints * Fix sent starts in Example * Improve python API of StateClass * Tweak comments and debug output of arc eager * Upd test * Fix state test * Fix state test
2020-12-13 04:08:32 +03:00
# Add the label explicitly. Previously we didn't require this.
ner1.add_label("MY_ORG")
2020-07-06 15:05:59 +03:00
ner1(doc1)
assert len(ner1.labels) == 2
assert "SOME_LABEL" in ner1.labels
assert "MY_ORG" in ner1.labels
with make_tempdir() as d:
# assert IO goes fine
output_dir = ensure_path(d)
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
2020-08-09 23:36:23 +03:00
config = {}
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
ner2 = nlp1.create_pipe("ner", config=config)
2020-07-06 15:05:59 +03:00
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
@pytest.mark.issue(4054)
2020-07-06 15:05:59 +03:00
def test_issue4054(en_vocab):
"""Test that a new blank model can be made with a vocab from file,
and that serialization does not drop the language at any point."""
nlp1 = English()
vocab1 = nlp1.vocab
with make_tempdir() as d:
vocab_dir = ensure_path(d / "vocab")
if not vocab_dir.exists():
vocab_dir.mkdir()
vocab1.to_disk(vocab_dir)
vocab2 = Vocab().from_disk(vocab_dir)
nlp2 = spacy.blank("en", vocab=vocab2)
nlp_dir = ensure_path(d / "nlp")
if not nlp_dir.exists():
nlp_dir.mkdir()
nlp2.to_disk(nlp_dir)
nlp3 = load_model(nlp_dir)
assert nlp3.lang == "en"
@pytest.mark.issue(4120)
2020-07-06 15:05:59 +03:00
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
doc2 = Doc(en_vocab, words=["a", "b", "c"])
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
@pytest.mark.issue(4133)
2020-07-06 15:05:59 +03:00
def test_issue4133(en_vocab):
nlp = English()
vocab_bytes = nlp.vocab.to_bytes()
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
doc = Doc(en_vocab, words=words)
for i, token in enumerate(doc):
token.pos_ = pos[i]
# usually this is already True when starting from proper models instead of blank English
doc_bytes = doc.to_bytes()
vocab = Vocab()
vocab = vocab.from_bytes(vocab_bytes)
doc = Doc(vocab).from_bytes(doc_bytes)
actual = []
for token in doc:
actual.append(token.pos_)
assert actual == pos
@pytest.mark.issue(4190)
2020-07-06 15:05:59 +03:00
def test_issue4190():
def customize_tokenizer(nlp):
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
exceptions = {
k: v
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
if not (len(k) == 2 and k[1] == ".")
}
new_tokenizer = Tokenizer(
nlp.vocab,
exceptions,
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=nlp.tokenizer.token_match,
)
nlp.tokenizer = new_tokenizer
test_string = "Test c."
# Load default language
nlp_1 = English()
doc_1a = nlp_1(test_string)
result_1a = [token.text for token in doc_1a] # noqa: F841
# Modify tokenizer
customize_tokenizer(nlp_1)
doc_1b = nlp_1(test_string)
result_1b = [token.text for token in doc_1b]
# Save and Reload
with make_tempdir() as model_dir:
nlp_1.to_disk(model_dir)
nlp_2 = load_model(model_dir)
# This should be the modified tokenizer
doc_2 = nlp_2(test_string)
result_2 = [token.text for token in doc_2]
assert result_1b == result_2
@pytest.mark.issue(4267)
2020-07-06 15:05:59 +03:00
def test_issue4267():
2021-07-02 10:48:26 +03:00
"""Test that running an entity_ruler after ner gives consistent results"""
2020-07-06 15:05:59 +03:00
nlp = English()
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
ner = nlp.add_pipe("ner")
2020-07-06 15:05:59 +03:00
ner.add_label("PEOPLE")
2020-09-28 22:35:09 +03:00
nlp.initialize()
2020-07-06 15:05:59 +03:00
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
assert doc1.has_annotation("ENT_IOB")
2020-07-06 15:05:59 +03:00
for token in doc1:
assert token.ent_iob == 2
# add entity ruler and run again
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
ruler = nlp.add_pipe("entity_ruler")
2020-07-06 15:05:59 +03:00
ruler.add_patterns(patterns)
assert "entity_ruler" in nlp.pipe_names
assert "ner" in nlp.pipe_names
# assert that we still have correct IOB annotations
doc2 = nlp("hi")
assert doc2.has_annotation("ENT_IOB")
2020-07-06 15:05:59 +03:00
for token in doc2:
assert token.ent_iob == 2
Add Lemmatizer and simplify related components (#5848) * Add Lemmatizer and simplify related components * Add `Lemmatizer` pipe with `lookup` and `rule` modes using the `Lookups` tables. * Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma) * Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer, or morph rules) * Remove lemmatizer from `Vocab` * Adjust many many tests Differences: * No default lookup lemmas * No special treatment of TAG in `from_array` and similar required * Easier to modify labels in a `Tagger` * No extra strings added from morphology / tag map * Fix test * Initial fix for Lemmatizer config/serialization * Adjust init test to be more generic * Adjust init test to force empty Lookups * Add simple cache to rule-based lemmatizer * Convert language-specific lemmatizers Convert language-specific lemmatizers to component lemmatizers. Remove previous lemmatizer class. * Fix French and Polish lemmatizers * Remove outdated UPOS conversions * Update Russian lemmatizer init in tests * Add minimal init/run tests for custom lemmatizers * Add option to overwrite existing lemmas * Update mode setting, lookup loading, and caching * Make `mode` an immutable property * Only enforce strict `load_lookups` for known supported modes * Move caching into individual `_lemmatize` methods * Implement strict when lang is not found in lookups * Fix tables/lookups in make_lemmatizer * Reallow provided lookups and allow for stricter checks * Add lookups asset to all Lemmatizer pipe tests * Rename lookups in lemmatizer init test * Clean up merge * Refactor lookup table loading * Add helper from `load_lemmatizer_lookups` that loads required and optional lookups tables based on settings provided by a config. Additional slight refactor of lookups: * Add `Lookups.set_table` to set a table from a provided `Table` * Reorder class definitions to be able to specify type as `Table` * Move registry assets into test methods * Refactor lookups tables config Use class methods within `Lemmatizer` to provide the config for particular modes and to load the lookups from a config. * Add pipe and score to lemmatizer * Simplify Tagger.score * Add missing import * Clean up imports and auto-format * Remove unused kwarg * Tidy up and auto-format * Update docstrings for Lemmatizer Update docstrings for Lemmatizer. Additionally modify `is_base_form` API to take `Token` instead of individual features. * Update docstrings * Remove tag map values from Tagger.add_label * Update API docs * Fix relative link in Lemmatizer API docs
2020-08-07 16:27:13 +03:00
@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab")
@pytest.mark.issue(4272)
2020-07-06 15:05:59 +03:00
def test_issue4272():
"""Test that lookup table can be accessed from Token.lemma if no POS tags
are available."""
nlp = Greek()
doc = nlp("Χθες")
assert doc[0].lemma_
def test_multiple_predictions():
class DummyPipe(TrainablePipe):
2020-07-06 15:05:59 +03:00
def __init__(self):
self.model = "dummy_model"
def predict(self, docs):
return ([1, 2, 3], [4, 5, 6])
def set_annotations(self, docs, scores):
2020-07-06 15:05:59 +03:00
return docs
nlp = Language()
doc = nlp.make_doc("foo")
dummy_pipe = DummyPipe()
dummy_pipe(doc)
@pytest.mark.issue(4313)
2020-07-06 15:05:59 +03:00
def test_issue4313():
2021-07-02 10:48:26 +03:00
"""This should not crash or exit with some strange error code"""
2020-07-06 15:05:59 +03:00
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"beam_width": beam_width,
"beam_density": beam_density,
}
ner = nlp.add_pipe("beam_ner", config=config)
2020-07-06 15:05:59 +03:00
ner.add_label("SOME_LABEL")
nlp.initialize()
2020-07-06 15:05:59 +03:00
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
assert "SOME_LABEL" in ner.labels
apple_ent = Span(doc, 5, 6, label="MY_ORG")
doc.ents = list(doc.ents) + [apple_ent]
# ensure the beam_parse still works with the new label
docs = [doc]
2021-01-15 03:57:36 +03:00
ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density)
assert len(ner.labels) == 2
assert "MY_ORG" in ner.labels
2020-07-06 15:05:59 +03:00
@pytest.mark.issue(4348)
2020-07-06 15:05:59 +03:00
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
nlp = English()
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
TRAIN_DATA = [example, example]
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
2020-09-28 22:35:09 +03:00
optimizer = nlp.initialize()
2020-07-06 15:05:59 +03:00
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
@pytest.mark.issue(4367)
2020-07-06 15:05:59 +03:00
def test_issue4367():
"""Test that docbin init goes well"""
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@pytest.mark.issue(4373)
2020-07-06 15:05:59 +03:00
def test_issue4373():
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
matcher = Matcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
matcher = PhraseMatcher(Vocab())
assert isinstance(matcher.vocab, Vocab)
@pytest.mark.issue(4402)
2020-07-06 15:05:59 +03:00
def test_issue4402():
json_data = {
"id": 0,
"paragraphs": [
{
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "How", "ner": "O"},
{"id": 1, "orth": "should", "ner": "O"},
{"id": 2, "orth": "I", "ner": "O"},
{"id": 3, "orth": "cook", "ner": "O"},
{"id": 4, "orth": "bacon", "ner": "O"},
{"id": 5, "orth": "in", "ner": "O"},
{"id": 6, "orth": "an", "ner": "O"},
{"id": 7, "orth": "oven", "ner": "O"},
{"id": 8, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{
"tokens": [
{"id": 9, "orth": "\n", "ner": "O"},
{"id": 10, "orth": "I", "ner": "O"},
{"id": 11, "orth": "'ve", "ner": "O"},
{"id": 12, "orth": "heard", "ner": "O"},
{"id": 13, "orth": "of", "ner": "O"},
{"id": 14, "orth": "people", "ner": "O"},
{"id": 15, "orth": "cooking", "ner": "O"},
{"id": 16, "orth": "bacon", "ner": "O"},
{"id": 17, "orth": "in", "ner": "O"},
{"id": 18, "orth": "an", "ner": "O"},
{"id": 19, "orth": "oven", "ner": "O"},
{"id": 20, "orth": ".", "ner": "O"},
],
"brackets": [],
},
],
"cats": [
{"label": "baking", "value": 1.0},
{"label": "not_baking", "value": 0.0},
],
},
{
"raw": "What is the difference between white and brown eggs?\n",
"sentences": [
{
"tokens": [
{"id": 0, "orth": "What", "ner": "O"},
{"id": 1, "orth": "is", "ner": "O"},
{"id": 2, "orth": "the", "ner": "O"},
{"id": 3, "orth": "difference", "ner": "O"},
{"id": 4, "orth": "between", "ner": "O"},
{"id": 5, "orth": "white", "ner": "O"},
{"id": 6, "orth": "and", "ner": "O"},
{"id": 7, "orth": "brown", "ner": "O"},
{"id": 8, "orth": "eggs", "ner": "O"},
{"id": 9, "orth": "?", "ner": "O"},
],
"brackets": [],
},
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
],
"cats": [
{"label": "baking", "value": 0.0},
{"label": "not_baking", "value": 1.0},
],
},
],
}
nlp = English()
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
2020-09-22 12:50:19 +03:00
docs = json_to_docs([json_data])
2020-07-06 15:05:59 +03:00
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
reader = Corpus(output_file)
train_data = list(reader(nlp))
2020-07-06 15:05:59 +03:00
assert len(train_data) == 2
split_train_data = []
for eg in train_data:
split_train_data.extend(eg.split_sents())
assert len(split_train_data) == 4