spaCy/spacy/tests/regression/test_issue3001-3500.py

import pytest
from spacy import registry
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.pipeline.ner import DEFAULT_NER_MODEL
from spacy.pipeline import EntityRuler, EntityRecognizer
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.attrs import ENT_IOB, ENT_TYPE
from spacy.compat import pickle
from spacy import displacy
from spacy.vectors import Vectors
import numpy


def test_issue3002():
    """Test that the tokenizer doesn't hang on a long list of dots"""
    nlp = German()
    doc = nlp(
        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
    )
    assert len(doc) == 5


def test_issue3009(en_vocab):
    """Test problem with matcher quantifiers"""
    patterns = [
        [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
        [
            {"ORTH": "has"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
        [
            {"ORTH": "has"},
            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
            {"LOWER": "to"},
            {"LOWER": "do"},
            {"TAG": "IN"},
        ],
    ]
    words = ["also", "has", "to", "do", "with"]
    tags = ["RB", "VBZ", "TO", "VB", "IN"]
    pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
    matcher = Matcher(en_vocab)
    for i, pattern in enumerate(patterns):
        matcher.add(str(i), [pattern])
        matches = matcher(doc)
        assert matches


def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = [("PERCENT", 2, 4)]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected


def test_issue3199():
    """Test that Span.noun_chunks works correctly if no noun chunks iterator
    is available. To make this test future-proof, we're constructing a Doc
    with a new Vocab here and a parse tree to make sure the noun chunks run.
    """
    words = ["This", "is", "a", "sentence"]
    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
    assert list(doc[0:3].noun_chunks) == []


def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label("ANIMAL")
    nlp.initialize()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    ner2 = nlp2.add_pipe("ner")
    model = ner2.model
    model.attrs["resize_output"](model, ner.moves.n_moves)
    nlp2.from_bytes(nlp.to_bytes())
    assert ner2.move_names == move_names


def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
    matcher.add("TEST2", [nlp("d")])
    assert len(matcher) == 2


def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
    matcher.add("TEST2", [nlp("d")])
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)


def test_issue3277(es_tokenizer):
    """Test that hyphens are split correctly as prefixes."""
    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    assert len(doc) == 14
    assert doc[0].text == "\u2014"
    assert doc[5].text == "\u2013"
    assert doc[9].text == "\u2013"


def test_issue3288(en_vocab):
    """Test that retokenization works correctly via displaCy when punctuation
    is merged onto the preceeding token and tensor is resized."""
    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
    heads = [1, 1, 1, 4, 4, 6, 4, 4]
    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
    displacy.render(doc)


def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe("textcat")
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe("textcat")
    new_nlp.from_bytes(bytes_data)


def test_issue3328(en_vocab):
    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
    matcher = Matcher(en_vocab)
    patterns = [
        [{"LOWER": {"IN": ["hello", "how"]}}],
        [{"LOWER": {"IN": ["you", "doing"]}}],
    ]
    matcher.add("TEST", patterns)
    matches = matcher(doc)
    assert len(matches) == 4
    matched_texts = [doc[start:end].text for _, start, end in matches]
    assert matched_texts == ["Hello", "how", "you", "doing"]


def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]


def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(doc.vocab, model, **config)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")


def test_issue3412():
    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
    vectors = Vectors(data=data, keys=["A", "B", "C"])
    keys, best_rows, scores = vectors.most_similar(
        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
    )
    assert best_rows[0] == 2


@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
def test_issue3449():
    nlp = English()
    nlp.add_pipe("sentencizer")
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"


def test_issue3456():
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    nlp.initialize()
    list(nlp.pipe(["hi", ""]))


def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.has_annotation("SENT_START")
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.has_annotation("SENT_START")
    assert len(list(new_doc.sents)) == 1
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								import pytest
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								from spacy import registry
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								from spacy.lang.en import English
 								from spacy.lang.de import German
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								from spacy.pipeline.ner import DEFAULT_NER_MODEL
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								from spacy.pipeline import EntityRuler, EntityRecognizer
 								from spacy.matcher import Matcher, PhraseMatcher
 								from spacy.tokens import Doc
 								from spacy.vocab import Vocab
 								from spacy.attrs import ENT_IOB, ENT_TYPE
-												Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]

											
										
										
											2019-12-22 03:53:56 +03:00
+								from spacy.compat import pickle
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								from spacy import displacy
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 13:04:46 +03:00
+								from spacy.vectors import Vectors
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								import numpy
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
 								def test_issue3002():
 								    """Test that the tokenizer doesn't hang on a long list of dots"""
 								    nlp = German()
 								    doc = nlp(
 								        "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl"
 								    )
 								    assert len(doc) == 5
 								def test_issue3009(en_vocab):
 								    """Test problem with matcher quantifiers"""
 								    patterns = [
-												Remove corpus-specific tag maps

Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map/py`, UPOS -> UPOS.

											
										
										
											2020-07-15 15:13:58 +03:00
+								        [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								        [
-												Remove corpus-specific tag maps

Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map/py`, UPOS -> UPOS.

											
										
										
											2020-07-15 15:13:58 +03:00
+								            {"ORTH": "has"},
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 22:54:26 +03:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								        ],
 								        [
-												Remove corpus-specific tag maps

Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map/py`, UPOS -> UPOS.

											
										
										
											2020-07-15 15:13:58 +03:00
+								            {"ORTH": "has"},
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								            {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
 								            {"LOWER": "to"},
 								            {"LOWER": "do"},
-												Make regression test less sensitive to tag-map stuff

											
										
										
											2019-08-25 22:54:26 +03:00
+								            {"TAG": "IN"},
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								        ],
 								    ]
 								    words = ["also", "has", "to", "do", "with"]
 								    tags = ["RB", "VBZ", "TO", "VB", "IN"]
-												Remove corpus-specific tag maps

Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map/py`, UPOS -> UPOS.

											
										
										
											2020-07-15 15:13:58 +03:00
+								    pos = ["ADV", "VERB", "ADP", "VERB", "ADP"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=words, tags=tags, pos=pos)
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    matcher = Matcher(en_vocab)
 								    for i, pattern in enumerate(patterns):
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 23:21:08 +03:00
+								        matcher.add(str(i), [pattern])
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								        matches = matcher(doc)
 								        assert matches
 								def test_issue3012(en_vocab):
 								    """Test that the is_tagged attribute doesn't get overwritten when we from_array
 								    without tag information."""
 								    words = ["This", "is", "10", "%", "."]
 								    tags = ["DT", "VBZ", "CD", "NN", "."]
 								    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    ents = [("PERCENT", 2, 4)]
 								    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								    assert doc.has_annotation("TAG")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    expected = ("10", "NUM", "CD", "PERCENT")
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    header = [ENT_IOB, ENT_TYPE]
 								    ent_array = doc.to_array(header)
 								    doc.from_array(header, ent_array)
 								    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
 								    # Serializing then deserializing
 								    doc_bytes = doc.to_bytes()
 								    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
 								    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected
 								def test_issue3199():
 								    """Test that Span.noun_chunks works correctly if no noun chunks iterator
 								    is available. To make this test future-proof, we're constructing a Doc
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								    with a new Vocab here and a parse tree to make sure the noun chunks run.
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    """
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    words = ["This", "is", "a", "sentence"]
 								    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    assert list(doc[0:3].noun_chunks) == []
 								def test_issue3209():
 								    """Test issue that occurred in spaCy nightly where NER labels were being
 								    mapped to classes incorrectly after loading the model, when the labels
 								    were added using ner.add_label().
 								    """
 								    nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ner = nlp.add_pipe("ner")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    ner.add_label("ANIMAL")
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
+								    nlp.initialize()
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
 								    assert ner.move_names == move_names
 								    nlp2 = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ner2 = nlp2.add_pipe("ner")
 								    model = ner2.model
-												Adapt parser and NER for transformers (#5449)

* Draft layer for BILUO actions

* Fixes to biluo layer

* WIP on BILUO layer

* Add tests for BILUO layer

* Format

* Fix transitions

* Update test

* Link in the simple_ner

* Update BILUO tagger

* Update __init__

* Import simple_ner

* Update test

* Import

* Add files

* Add config

* Fix label passing for BILUO and tagger

* Fix label handling for simple_ner component

* Update simple NER test

* Update config

* Hack train script

* Update BILUO layer

* Fix SimpleNER component

* Update train_from_config

* Add biluo_to_iob helper

* Add IOB layer

* Add IOBTagger model

* Update biluo layer

* Update SimpleNER tagger

* Update BILUO

* Read random seed in train-from-config

* Update use of normal_init

* Fix normalization of gradient in SimpleNER

* Update IOBTagger

* Remove print

* Tweak masking in BILUO

* Add dropout in SimpleNER

* Update thinc

* Tidy up simple_ner

* Fix biluo model

* Unhack train-from-config

* Update setup.cfg and requirements

* Add tb_framework.py for parser model

* Try to avoid memory leak in BILUO

* Move ParserModel into spacy.ml, avoid need for subclass.

* Use updated parser model

* Remove incorrect call to model.initializre in PrecomputableAffine

* Update parser model

* Avoid divide by zero in tagger

* Add extra dropout layer in tagger

* Refine minibatch_by_words function to avoid oom

* Fix parser model after refactor

* Try to avoid div-by-zero in SimpleNER

* Fix infinite loop in minibatch_by_words

* Use SequenceCategoricalCrossentropy in Tagger

* Fix parser model when hidden layer

* Remove extra dropout from tagger

* Add extra nan check in tagger

* Fix thinc version

* Update tests and imports

* Fix test

* Update test

* Update tests

* Fix tests

* Fix test

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-05-18 23:23:33 +03:00
+								    model.attrs["resize_output"](model, ner.moves.n_moves)
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    nlp2.from_bytes(nlp.to_bytes())
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    assert ner2.move_names == move_names
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
 								def test_issue3248_1():
 								    """Test that the PhraseMatcher correctly reports its number of rules, not
 								    total number of patterns."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 23:21:08 +03:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    assert len(matcher) == 2
 								def test_issue3248_2():
 								    """Test that the PhraseMatcher can be pickled correctly."""
 								    nlp = English()
 								    matcher = PhraseMatcher(nlp.vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 23:21:08 +03:00
+								    matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
 								    matcher.add("TEST2", [nlp("d")])
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    data = pickle.dumps(matcher)
 								    new_matcher = pickle.loads(data)
 								    assert len(new_matcher) == len(matcher)
 								def test_issue3277(es_tokenizer):
 								    """Test that hyphens are split correctly as prefixes."""
 								    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
 								    assert len(doc) == 14
 								    assert doc[0].text == "\u2014"
 								    assert doc[5].text == "\u2013"
 								    assert doc[9].text == "\u2013"
 								def test_issue3288(en_vocab):
 								    """Test that retokenization works correctly via displaCy when punctuation
 								    is merged onto the preceeding token and tensor is resized."""
 								    words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    heads = [1, 1, 1, 4, 4, 6, 4, 4]
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"]
-												Tidy up tests and docs

											
										
										
											2020-09-21 21:43:54 +03:00
+								    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    doc.tensor = numpy.zeros((len(words), 96), dtype="float32")
 								    displacy.render(doc)
 								def test_issue3289():
 								    """Test that Language.to_bytes handles serializing a pipeline component
 								    with an uninitialized model."""
 								    nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    nlp.add_pipe("textcat")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    bytes_data = nlp.to_bytes()
 								    new_nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    new_nlp.add_pipe("textcat")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    new_nlp.from_bytes(bytes_data)
 								def test_issue3328(en_vocab):
 								    doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"])
 								    matcher = Matcher(en_vocab)
 								    patterns = [
 								        [{"LOWER": {"IN": ["hello", "how"]}}],
 								        [{"LOWER": {"IN": ["you", "doing"]}}],
 								    ]
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 23:21:08 +03:00
+								    matcher.add("TEST", patterns)
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    matches = matcher(doc)
 								    assert len(matches) == 4
 								    matched_texts = [doc[start:end].text for _, start, end in matches]
 								    assert matched_texts == ["Hello", "how", "you", "doing"]
 								def test_issue3331(en_vocab):
 								    """Test that duplicate patterns for different rules result in multiple
 								    matches, one per rule.
 								    """
 								    matcher = PhraseMatcher(en_vocab)
-												Implement new API for {Phrase}Matcher.add (backwards-compatible) (#4522)

* Implement new API for {Phrase}Matcher.add (backwards-compatible)

* Update docs

* Also update DependencyMatcher.add

* Update internals

* Rewrite tests to use new API

* Add basic check for common mistake

Raise error with suggestion if user likely passed in a pattern instead of a list of patterns

* Fix typo [ci skip]

											
										
										
											2019-10-25 23:21:08 +03:00
+								    matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
 								    matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
 								    matches = matcher(doc)
 								    assert len(matches) == 2
 								    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
 								    assert sorted(match_ids) == ["A", "B"]
 								def test_issue3345():
 								    """Test case where preset entity crosses sentence boundary."""
 								    nlp = English()
 								    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
 								    doc[4].is_sent_start = True
 								    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
-												Tidy up and auto-format

											
										
										
											2020-06-20 15:15:04 +03:00
+								    config = {
 								        "learn_tokens": False,
 								        "min_action_freq": 30,
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								        "update_with_oracle_cut_size": 100,
-												Tidy up and auto-format

											
										
										
											2020-06-20 15:15:04 +03:00
+								    }
-												Tidy up, autoformat, add types

											
										
										
											2020-07-25 16:01:15 +03:00
+								    cfg = {"model": DEFAULT_NER_MODEL}
-												Update config resolution to use new Thinc

											
										
										
											2020-09-27 23:21:31 +03:00
+								    model = registry.resolve(cfg, validate=True)["model"]
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    ner = EntityRecognizer(doc.vocab, model, **config)
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    # Add the OUT action. I wouldn't have thought this would be necessary...
 								    ner.moves.add_action(5, "")
 								    ner.add_label("GPE")
 								    doc = ruler(doc)
 								    # Get into the state just before "New"
 								    state = ner.moves.init_batch([doc])[0]
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    ner.moves.apply_transition(state, "O")
 								    # Check that B-GPE is valid.
 								    assert ner.moves.is_valid(state, "B-GPE")
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 13:04:46 +03:00
+								def test_issue3412():
 								    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
-												Fix most_similar for vectors with unused rows (#5348)

* Fix most_similar for vectors with unused rows

Address issues related to the unused rows in the vector table and
`most_similar`:

* Update `most_similar()` to search only through rows that are in use
according to `key2row`.

* Raise an error when `most_similar(n=n)` is larger than the number of
vectors in the table.

* Set and restore `_unset` correctly when vectors are added or
deserialized so that new vectors are added in the correct row.

* Set data and keys to the same length in `Vocab.prune_vectors()` to
avoid spurious entries in `key2row`.

* Fix regression test using `most_similar`

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 17:41:26 +03:00
+								    vectors = Vectors(data=data, keys=["A", "B", "C"])
-												Auto-format [ci skip]

											
										
										
											2019-10-24 17:21:08 +03:00
+								    keys, best_rows, scores = vectors.most_similar(
 								        numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
 								    )
 								    assert best_rows[0] == 2
-												prevent division by zero in most_similar method (#4488)


											
										
										
											2019-10-21 13:04:46 +03:00
-												Test suite clean up (#5781)

* step_through tests: skip instead of xfail

* test_empty_doc should be fixed with new Thinc version

* remove outdated test (there are other misaligned tests now)

* xfail reason

* fix test according to french exceptions

* clarified some skipped tests

* skip ukranian test instead of xfail

* skip instead of xfail

* skip + reason instead of xfail

* removed obsolete tests referring to removed "set_frozen" functionality

* fix test 999

* remove unused AlignmentError

* remove xfail where possible, skip otherwise

* increment thinc release for empty_doc test
											
										
										
											2020-07-20 15:49:54 +03:00
+								@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								def test_issue3449():
 								    nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    nlp.add_pipe("sentencizer")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
 								    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
 								    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
 								    t1 = nlp(text1)
 								    t2 = nlp(text2)
 								    t3 = nlp(text3)
 								    assert t1[5].text == "I"
 								    assert t2[5].text == "I"
 								    assert t3[5].text == "I"
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 13:50:48 +03:00
+								def test_issue3456():
 								    # this crashed because of a padding error in layer.ops.unflatten in thinc
 								    nlp = English()
-												Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
											
										
										
											2020-08-07 16:27:13 +03:00
+								    tagger = nlp.add_pipe("tagger")
 								    tagger.add_label("A")
-												begin_training -> initialize

											
										
										
											2020-09-28 22:35:09 +03:00
+								    nlp.initialize()
-												Tidy up and auto-format

											
										
										
											2019-10-18 12:27:38 +03:00
+								    list(nlp.pipe(["hi", ""]))
-												Ensure training doesn't crash with empty batches (#4360)

* unit test for previously resolved unflatten issue

* prevent batch of empty docs to cause problems

											
										
										
											2019-10-02 13:50:48 +03:00
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								def test_issue3468():
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								    """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    be restored after serialization."""
 								    nlp = English()
-												Refactor pipeline components, config and language data (#5759)

* Update with WIP

* Update with WIP

* Update with pipeline serialization

* Update types and pipe factories

* Add deep merge, tidy up and add tests

* Fix pipe creation from config

* Don't validate default configs on load

* Update spacy/language.py

Co-authored-by: Ines Montani <ines@ines.io>

* Adjust factory/component meta error

* Clean up factory args and remove defaults

* Add test for failing empty dict defaults

* Update pipeline handling and methods

* provide KB as registry function instead of as object

* small change in test to make functionality more clear

* update example script for EL configuration

* Fix typo

* Simplify test

* Simplify test

* splitting pipes.pyx into separate files

* moving default configs to each component file

* fix batch_size type

* removing default values from component constructors where possible (TODO: test 4725)

* skip instead of xfail

* Add test for config -> nlp with multiple instances

* pipeline.pipes -> pipeline.pipe

* Tidy up, document, remove kwargs

* small cleanup/generalization for Tok2VecListener

* use DEFAULT_UPSTREAM field

* revert to avoid circular imports

* Fix tests

* Replace deprecated arg

* Make model dirs require config

* fix pickling of keyword-only arguments in constructor

* WIP: clean up and integrate full config

* Add helper to handle function args more reliably

Now also includes keyword-only args

* Fix config composition and serialization

* Improve config debugging and add visual diff

* Remove unused defaults and fix type

* Remove pipeline and factories from meta

* Update spacy/default_config.cfg

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/default_config.cfg

* small UX edits

* avoid printing stack trace for debug CLI commands

* Add support for language-specific factories

* specify the section of the config which holds the model to debug

* WIP: add Language.from_config

* Update with language data refactor WIP

* Auto-format

* Add backwards-compat handling for Language.factories

* Update morphologizer.pyx

* Fix morphologizer

* Update and simplify lemmatizers

* Fix Japanese tests

* Port over tagger changes

* Fix Chinese and tests

* Update to latest Thinc

* WIP: xfail first Russian lemmatizer test

* Fix component-specific overrides

* fix nO for output layers in debug_model

* Fix default value

* Fix tests and don't pass objects in config

* Fix deep merging

* Fix lemma lookup data registry

Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed)

* Add types

* Add Vocab.from_config

* Fix typo

* Fix tests

* Make config copying more elegant

* Fix pipe analysis

* Fix lemmatizers and is_base_form

* WIP: move language defaults to config

* Fix morphology type

* Fix vocab

* Remove comment

* Update to latest Thinc

* Add morph rules to config

* Tidy up

* Remove set_morphology option from tagger factory

* Hack use_gpu

* Move [pipeline] to top-level block and make [nlp.pipeline] list

Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them

* Fix use_gpu and resume in CLI

* Auto-format

* Remove resume from config

* Fix formatting and error

* [pipeline] -> [components]

* Fix types

* Fix tagger test: requires set_morphology?

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-07-22 14:42:59 +03:00
+								    nlp.add_pipe("sentencizer")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    doc = nlp("Hello world")
 								    assert doc[0].is_sent_start
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								    assert doc.has_annotation("SENT_START")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    assert len(list(doc.sents)) == 1
 								    doc_bytes = doc.to_bytes()
 								    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
 								    assert new_doc[0].is_sent_start
-												Refactor Docs.is_ flags (#6044)

* Refactor Docs.is_ flags

* Add derived `Doc.has_annotation` method

  * `Doc.has_annotation(attr)` returns `True` for partial annotation

  * `Doc.has_annotation(attr, require_complete=True)` returns `True` for
    complete annotation

* Add deprecation warnings to `is_tagged`, `is_parsed`, `is_sentenced`
and `is_nered`

* Add `Doc._get_array_attrs()`, which returns a full list of `Doc` attrs
for use with `Doc.to_array`, `Doc.to_bytes` and `Doc.from_docs`. The
list is the `DocBin` attributes list plus `SPACY` and `LENGTH`.

Notes on `Doc.has_annotation`:

* `HEAD` is converted to `DEP` because heads don't have an unset state

* Accept `IS_SENT_START` as a synonym of `SENT_START`

Additional changes:

* Add `NORM`, `ENT_ID` and `SENT_START` to default attributes for
`DocBin`

* In `Doc.from_array()` the presence of `DEP` causes `HEAD` to override
`SENT_START`

* In `Doc.from_array()` using `attrs` other than
`Doc._get_array_attrs()` (i.e., a user's custom list rather than our
default internal list) with both `HEAD` and `SENT_START` shows a warning
that `HEAD` will override `SENT_START`

* `set_children_from_heads` does not require dependency labels to set
sentence boundaries and sets `sent_start` for all non-sentence starts to
`-1`

* Fix call to set_children_form_heads

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-09-17 01:14:01 +03:00
+								    assert new_doc.has_annotation("SENT_START")
-												Merge regression tests

											
										
										
											2019-07-10 13:49:18 +03:00
+								    assert len(list(new_doc.sents)) == 1