Merge branch 'master' into spacy.io

2025-11-07 03:17:37 +03:00 · 2021-02-14 14:39:46 +11:00 · 2021-02-14 14:39:46 +11:00 · 0c7937c74d
commit 0c7937c74d
parent 3246cf8b2b 4b729660bd
6 changed files with 101 additions and 4 deletions
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@ -103,6 +103,10 @@ def fill_config(
    # config result is a valid config
    nlp = util.load_model_from_config(nlp.config)
    filled = nlp.config
    # If we have sourced components in the base config, those will have been
    # replaced with their actual config after loading, so we have to re-add them
    sourced = util.get_sourced_components(config)
    filled["components"].update(sourced)
    if pretraining:
        validate_config_for_pretrain(filled, msg)
        pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@ -278,7 +278,7 @@ cdef cppclass StateC:
        return this._stack.size()
    int buffer_length() nogil const:
-        return this.length - this._b_i
+        return (this.length - this._b_i) + this._rebuffer.size()
    void push() nogil:
        b0 = this.B(0)
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@ -134,8 +134,6 @@ cdef class TransitionSystem:
    def is_valid(self, StateClass stcls, move_name):
        # Report whether the transition named `move_name` is valid for the
        # parser state held by `stcls`.
        action = self.lookup_transition(move_name)
        # A transition whose `move` field is 0 is rejected outright --
        # presumably 0 marks a name that could not be resolved (TODO confirm
        # against lookup_transition).
        if action.move == 0:
            return False
        # Otherwise defer to the transition's own validity check, passing
        # `stcls.c` and the transition's label.
        return action.is_valid(stcls.c, action.label)
    cdef int set_valid(self, int* is_valid, const StateC* st) nogil:
--- a/spacy/tests/regression/test_issue7055.py
+++ b/spacy/tests/regression/test_issue7055.py
@ -0,0 +1,40 @@
 from spacy.cli.init_config import fill_config
 from spacy.util import load_config
 from spacy.lang.en import English
 from thinc.api import Config
 from ..util import make_tempdir
 def test_issue7055():
    """Regression test for issue 7055: after fill-config, components that were
    sourced from another pipeline must still be "source" entries rather than
    having been expanded into factory definitions."""
    pretrained = English.from_config(
        {
            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]},
            "components": {
                "tok2vec": {"factory": "tok2vec"},
                "tagger": {"factory": "tagger"},
            },
        }
    )
    with make_tempdir() as tmp_dir:
        # Sourced components have to point at a pipeline that is loadable
        # from disk, so serialize the source pipeline first.
        model_dir = tmp_dir / "test_model"
        pretrained.to_disk(model_dir)
        model_ref = str(model_dir)
        partial_cfg = Config(
            {
                "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
                "components": {
                    "tok2vec": {"source": model_ref},
                    "tagger": {"source": model_ref},
                    "ner": {"factory": "ner"},
                },
            }
        )
        cfg_in = tmp_dir / "base.cfg"
        cfg_out = tmp_dir / "config.cfg"
        partial_cfg.to_disk(cfg_in)
        fill_config(cfg_out, cfg_in, silent=True)
        result = load_config(cfg_out)
    components = result["components"]
    # Both sourced components keep their reference to the source pipeline.
    for sourced_name in ("tok2vec", "tagger"):
        assert components[sourced_name]["source"] == model_ref
    # The factory-based component is still a factory and had its defaults
    # (including the model block) filled in.
    assert components["ner"]["factory"] == "ner"
    assert "model" in components["ner"]
--- a/spacy/tests/regression/test_issue7056.py
+++ b/spacy/tests/regression/test_issue7056.py
@ -0,0 +1,27 @@
 import pytest
 from spacy.tokens.doc import Doc
 from spacy.vocab import Vocab
 from spacy.pipeline._parser_internals.arc_eager import ArcEager
 def test_issue7056():
    """Regression test for issue 7056: check that the Unshift transition
    behaves correctly and does not lead to sentence segmentation errors."""
    vocab = Vocab()
    actions = ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"])
    moves = ArcEager(vocab.strings, actions)
    doc = Doc(vocab, words="Severe pain , after trauma".split())
    state = moves.init_batch([doc])[0]
    # The transitions must be applied in exactly this order.
    transition_names = ("S", "L-amod", "S", "S", "S", "R-pobj", "D", "D", "D")
    for transition_name in transition_names:
        moves.apply_transition(state, transition_name)
    assert not state.eol()
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1,5 +1,33 @@
 {
    "resources": [
        {
            "id": "spacy-dbpedia-spotlight",
            "title": "DBpedia Spotlight for SpaCy",
            "slogan": "Use DBpedia Spotlight to link entities inside spaCy",
            "description": "This library links spaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).",
            "github": "MartinoMensio/spacy-dbpedia-spotlight",
            "pip": "spacy-dbpedia-spotlight",
            "code_example": [
                "import spacy_dbpedia_spotlight",
                "# load your model as usual",
                "nlp = spacy.load('en_core_web_lg')",
                "# add the pipeline stage",
                "nlp.add_pipe('dbpedia_spotlight')",
                "# get the document",
                "doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')",
                "# see the entities",
                "print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])",
                "# inspect the raw data from DBpedia spotlight",
                "print(doc.ents[0]._.dbpedia_raw_result)"
            ],
            "category": ["models", "pipeline"],
            "author": "Martino Mensio",
            "author_links": {
                "twitter": "MartinoMensio",
                "github": "MartinoMensio",
                "website": "https://martinomensio.github.io"
            }
        },
        {
            "id": "spacy-textblob",
            "title": "spaCyTextBlob",