Merge branch 'master' into spacy.io

Ines Montani 2019-12-13 15:57:49 +01:00
commit ae9fac2d87
17 changed files with 172 additions and 47 deletions

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function
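As background for the two example scripts in this commit, a minimal sketch of the `KnowledgeBase` API they rely on; the entity ID, frequency, vector and alias values below are placeholders, not data from this commit:

from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
# every entity vector stored in a KB must have the same, fixed length
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6.0, -4.0, 3.0])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
kb.dump("my_kb")  # the training script below loads this again with kb.load_bulk("my_kb")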

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
* Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.util import minibatch, compounding
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
# Note that in a realistic application, an actual NER algorithm should be used instead.
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
entity_linker = nlp.create_pipe("entity_linker")
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)
else:
entity_linker = nlp.get_pipe("entity_linker")
kb = entity_linker.kb
# make sure the annotated examples correspond to known identifiers in the knowledge base
kb_ids = kb.get_entity_strings()
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
with nlp.disable_pipes("entity_linker"):
doc = nlp(text)
annotation_clean = annotation
for offset, kb_id_dict in annotation["links"].items():
new_dict = {}
for kb_id, value in kb_id_dict.items():
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
print(
"Removed", kb_id, "from training because it is not in the KB."
)
annotation["links"][offset] = new_dict
annotation_clean["links"][offset] = new_dict
TRAIN_DOCS.append((doc, annotation_clean))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
random.shuffle(TRAIN_DOCS)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
def _apply_model(nlp):
for text, annotation in TRAIN_DATA:
doc = nlp.tokenizer(text)
# set entities so the evaluation is independent of the NER step
# all the examples contain 'Russ Cochran' as the first two tokens in the sentence
rc_ent = Span(doc, 0, 2, label=PERSON)
doc.ents = [rc_ent]
# apply the entity linker which will now make predictions for the 'Russ Cochran' entities
doc = nlp.get_pipe("entity_linker")(doc)
doc = nlp(text)
print()
print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])

View File

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0

View File

@@ -38,13 +38,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0

View File

@@ -81,7 +81,8 @@ class Warnings(object):
"Future versions may introduce a `n_process` argument for "
"parallel inference via multiprocessing.")
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
"ignoring the duplicate entry.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
@@ -531,6 +532,9 @@ class Errors(object):
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
E187 = ("Only unicode strings are supported as labels.")
E188 = ("Could not match the gold entity links to entities in the doc - "
"make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.")
@add_codes

View File

@@ -136,29 +136,34 @@ cdef class KnowledgeBase:
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)
nr_entities = len(entity_list)
nr_entities = len(set(entity_list))
self._entry_index = PreshMap(nr_entities+1)
self._entries = entry_vec(nr_entities+1)
i = 0
cdef KBEntryC entry
cdef hash_t entity_hash
while i < nr_entities:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
while i < len(entity_list):
# only process this entity if its unique ID hasn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
if entity_hash in self._entry_index:
user_warning(Warnings.W018.format(entity=entity_list[i]))
vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
entry.feats_row = -1 # Features table currently not implemented
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
i += 1

View File

@@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
elif isinstance(value, (dict, int)):
elif isinstance(value, int):
pass
elif isinstance(value, dict):
continue
else:
raise ValueError(Errors.E153.format(vtype=type(value).__name__))

View File

@@ -1142,7 +1142,7 @@ cdef class EntityRecognizer(Parser):
@component(
"entity_linker",
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"]
)
class EntityLinker(Pipe):
@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
for entity, kb_dict in gold.links.items():
start, end = entity
mention = doc.text[start:end]
# the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
if not (start, end) in ents_by_offset:
raise RuntimeError(Errors.E188)
ent = ents_by_offset[(start, end)]
for kb_id, value in kb_dict.items():
# Currently only training on the positive instances
if value:
sentence_docs.append(ent.sent.as_doc())
try:
sentence_docs.append(ent.sent.as_doc())
except AttributeError:
# Catch the exception when ent.sent is None and raise a more informative error
raise RuntimeError(Errors.E030)
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)

View File

@@ -69,7 +69,8 @@ cdef class ParserBeam(object):
cdef StateC* st
for state in states:
beam = Beam(self.moves.n_moves, width, min_density=density)
beam.initialize(self.moves.init_beam_state, state.c.length,
beam.initialize(self.moves.init_beam_state,
self.moves.del_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width):
st = <StateC*>beam.at(i)

View File

@@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class ArcEager(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
self.init_beam_state = _init_state
self.del_beam_state = _del_state
@classmethod
def get_actions(cls, **kwargs):

View File

@@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
cdef class TransitionSystem:
cdef Pool mem
cdef StringStore strings
@@ -42,6 +44,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state
cdef public object labels
cdef int initialize_state(self, StateC* state) nogil

View File

@@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
self.mem = Pool()
@@ -44,6 +49,7 @@ cdef class TransitionSystem:
self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
self.del_beam_state = _del_state
def __reduce__(self):
return (self.__class__, (self.strings, self.labels), None, None)
@@ -72,7 +78,8 @@ cdef class TransitionSystem:
for doc in docs:
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
beam.initialize(self.init_beam_state, doc.length, doc.c)
beam.initialize(self.init_beam_state, self.del_beam_state,
doc.length, doc.c)
for i in range(beam.width):
state = <StateC*>beam.at(i)
state.offset = offset

View File

@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
return doc
@pytest.mark.parametrize(
"i_sent,i,j,text",
[
(0, 0, len("This is a"), "This is a"),
(1, 0, len("This is another"), "This is another"),
(2, len("And "), len("And ") + len("a third"), "a third"),
(0, 1, 2, None),
],
)
def test_char_span(doc, i_sent, i, j, text):
sents = list(doc.sents)
span = sents[i_sent].char_span(i, j)
if not text:
assert not span
else:
assert span.text == text
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0

View File

@@ -0,0 +1,34 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@@ -584,6 +584,22 @@ cdef class Span:
else:
return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
"""Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
"""
start_idx += self.start_char
end_idx += self.start_char
return self.doc.char_span(start_idx, end_idx)
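A brief usage sketch of the new method on made-up text; note that start_idx and end_idx are interpreted relative to the span's own text, since they are shifted by self.start_char before being handed to doc.char_span:

from spacy.lang.en import English

nlp = English()
doc = nlp("I like New York in Autumn.")
span = doc[2:6]                           # "New York in Autumn"
sub = span.char_span(0, len("New York"))  # offsets are relative to span.text
assert sub.text == "New York"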
@property
def conjuncts(self):
"""Tokens that are conjoined to the span's root.

View File

@@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx
A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
named entities, export annotations to numpy arrays, losslessly serialize to
compressed binary strings. The `Doc` object holds an array of `TokenC]` structs.
compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
The Python-level `Token` and [`Span`](/api/span) objects are views of this
array, i.e. they don't own the data themselves.
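For example, a short sketch of those operations (assuming the en_core_web_sm model is installed; the sentence is made up):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace was born in London.")
print([(ent.text, ent.label_) for ent in doc.ents])  # named entities
print([sent.text for sent in doc.sents])             # sentences
data = doc.to_bytes()                                 # lossless binary serialization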

View File

@@ -1261,6 +1261,21 @@
},
"category": ["podcasts"]
},
{
"type": "education",
"id": "practical-ai-podcast",
"title": "Practical AI: Modern NLP with spaCy",
"slogan": "December 2019",
"description": "\"SpaCy is awesome for NLP! Its easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. You dont want to miss this episode!\"",
"thumb": "https://i.imgur.com/jn8Bcdw.png",
"url": "https://changelog.com/practicalai/68",
"author": "Daniel Whitenack & Chris Benson",
"author_links": {
"website": "https://changelog.com/practicalai",
"twitter": "https://twitter.com/PracticalAIFM"
},
"category": ["podcasts"]
},
{
"id": "adam_qas",
"title": "ADAM: Question Answering System",