From 6c0c3d5ddc41b38baf929ce1188d6b2693fba51e Mon Sep 17 00:00:00 2001 From: Martino Mensio Date: Fri, 12 Feb 2021 19:06:51 +0100 Subject: [PATCH 1/3] added spacy-dbpedia-spotlight --- website/meta/universe.json | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index d5768d73b..45d146511 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1,6 +1,34 @@ { "resources": [ - { + { + "id": "spacy-dbpedia-spotlight", + "title": "DBpedia Spotlight for spaCy", + "slogan": "Use DBpedia Spotlight to link entities inside spaCy", + "description": "This library links spaCy with [DBpedia Spotlight](https://www.dbpedia-spotlight.org/). You can easily get the DBpedia entities from your documents, using the public web service or by using your own instance of DBpedia Spotlight. The `doc.ents` are populated with the entities and all their details (URI, type, ...).", + "github": "MartinoMensio/spacy-dbpedia-spotlight", + "pip": "spacy-dbpedia-spotlight", + "code_example": [ + "import spacy_dbpedia_spotlight", + "# load your model as usual", + "nlp = spacy.load('en_core_web_lg')", + "# add the pipeline stage", + "nlp.add_pipe('dbpedia_spotlight')", + "# get the document", + "doc = nlp('The president of USA is calling Boris Johnson to decide what to do about coronavirus')", + "# see the entities", + "print('Entities', [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])", + "# inspect the raw data from DBpedia spotlight", + "print(doc.ents[0]._.dbpedia_raw_result)" + ], + "category": ["models", "pipeline"], + "author": "Martino Mensio", + "author_links": { + "twitter": "MartinoMensio", + "github": "MartinoMensio", + "website": "https://martinomensio.github.io" + } + }, + { "id": "spacy-textblob", "title": "spaCyTextBlob", "slogan": "Easy sentiment analysis for spaCy using TextBlob", From 0fb8d437c09234ce913cb84982acfbdaf7b8c61d Mon Sep 17 00:00:00 2001 
From: Matthew Honnibal Date: Sun, 14 Feb 2021 13:38:13 +1100 Subject: [PATCH 2/3] Fix sentence fragments bug (#7056, #7035) (#7057) * Add test for #7035 * Update test for issue 7056 * Fix test * Fix transitions method used in testing * Fix state eol detection when rebuffer * Clean up redundant fix --- spacy/pipeline/_parser_internals/_state.pxd | 2 +- .../_parser_internals/transition_system.pyx | 2 -- spacy/tests/regression/test_issue7056.py | 27 +++++++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) create mode 100644 spacy/tests/regression/test_issue7056.py diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index a6bf926f9..161f3ca48 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -278,7 +278,7 @@ cdef cppclass StateC: return this._stack.size() int buffer_length() nogil const: - return this.length - this._b_i + return (this.length - this._b_i) + this._rebuffer.size() void push() nogil: b0 = this.B(0) diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 9bb4f7f5f..9e6f847eb 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -134,8 +134,6 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) - if action.move == 0: - return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: diff --git a/spacy/tests/regression/test_issue7056.py b/spacy/tests/regression/test_issue7056.py new file mode 100644 index 000000000..64a420b84 --- /dev/null +++ b/spacy/tests/regression/test_issue7056.py @@ -0,0 +1,27 @@ +import pytest + +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab +from spacy.pipeline._parser_internals.arc_eager import ArcEager + + +def 
test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, + ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() From f4f46b617f2106f51579bae2b71c71867d1cc7eb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 14 Feb 2021 14:02:14 +1100 Subject: [PATCH 3/3] Preserve sourced components in fill-config (fixes #7055) (#7058) --- spacy/cli/init_config.py | 4 +++ spacy/tests/regression/test_issue7055.py | 40 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 spacy/tests/regression/test_issue7055.py diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 6bdf393f6..9880c389c 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -103,6 +103,10 @@ def fill_config( # config result is a valid config nlp = util.load_model_from_config(nlp.config) filled = nlp.config + # If we have sourced components in the base config, those will have been + # replaced with their actual config after loading, so we have to re-add them + sourced = util.get_sourced_components(config) + filled["components"].update(sourced) if pretraining: validate_config_for_pretrain(filled, msg) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) diff --git a/spacy/tests/regression/test_issue7055.py b/spacy/tests/regression/test_issue7055.py new file mode 100644 index 000000000..c7ddb0a75 --- /dev/null +++ b/spacy/tests/regression/test_issue7055.py @@ -0,0 +1,40 @@ +from 
spacy.cli.init_config import fill_config +from spacy.util import load_config +from spacy.lang.en import English +from thinc.api import Config + +from ..util import make_tempdir + + +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"]