mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
ae9fac2d87
|
@ -8,8 +8,8 @@ For more details, see the documentation:
|
|||
* Knowledge base: https://spacy.io/api/kb
|
||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
||||
|
||||
Compatible with: spaCy v2.2
|
||||
Last tested with: v2.2
|
||||
Compatible with: spaCy v2.2.3
|
||||
Last tested with: v2.2.3
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
|
|
|
@ -8,8 +8,8 @@ For more details, see the documentation:
|
|||
* Training: https://spacy.io/usage/training
|
||||
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
||||
|
||||
Compatible with: spaCy v2.2
|
||||
Last tested with: v2.2
|
||||
Compatible with: spaCy v2.2.3
|
||||
Last tested with: v2.2.3
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
|
@ -22,6 +22,7 @@ from spacy.vocab import Vocab
|
|||
|
||||
import spacy
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.tokens import Span
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
|
||||
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
||||
|
||||
# create the built-in pipeline components and add them to the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
|
||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
|
||||
# Note that in a realistic application, an actual NER algorithm should be used instead.
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline.
|
||||
if "entity_linker" not in nlp.pipe_names:
|
||||
entity_linker = nlp.create_pipe("entity_linker")
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
print("Loaded Knowledge Base from '%s'" % kb_path)
|
||||
entity_linker.set_kb(kb)
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
else:
|
||||
entity_linker = nlp.get_pipe("entity_linker")
|
||||
kb = entity_linker.kb
|
||||
|
||||
# make sure the annotated examples correspond to known identifiers in the knowledge base
|
||||
kb_ids = kb.get_entity_strings()
|
||||
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
|
||||
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
|
||||
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
|
||||
TRAIN_DOCS = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
with nlp.disable_pipes("entity_linker"):
|
||||
doc = nlp(text)
|
||||
annotation_clean = annotation
|
||||
for offset, kb_id_dict in annotation["links"].items():
|
||||
new_dict = {}
|
||||
for kb_id, value in kb_id_dict.items():
|
||||
|
@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
print(
|
||||
"Removed", kb_id, "from training because it is not in the KB."
|
||||
)
|
||||
annotation["links"][offset] = new_dict
|
||||
annotation_clean["links"][offset] = new_dict
|
||||
TRAIN_DOCS.append((doc, annotation_clean))
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
|
||||
|
@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
# reset and initialize the weights randomly
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(TRAIN_DOCS)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
|
@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
|
||||
def _apply_model(nlp):
|
||||
for text, annotation in TRAIN_DATA:
|
||||
doc = nlp.tokenizer(text)
|
||||
|
||||
# set entities so the evaluation is independent of the NER step
|
||||
# all the examples contain 'Russ Cochran' as the first two tokens in the sentence
|
||||
rc_ent = Span(doc, 0, 2, label=PERSON)
|
||||
doc.ents = [rc_ent]
|
||||
|
||||
# apply the entity linker which will now make predictions for the 'Russ Cochran' entities
|
||||
doc = nlp.get_pipe("entity_linker")(doc)
|
||||
|
||||
doc = nlp(text)
|
||||
print()
|
||||
print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
|
||||
print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=7.3.0,<7.4.0
|
||||
thinc==7.4.0.dev0
|
||||
blis>=0.4.0,<0.5.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
|
|
|
@ -38,13 +38,13 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=7.3.0,<7.4.0
|
||||
thinc==7.4.0.dev0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=7.3.0,<7.4.0
|
||||
thinc==7.4.0.dev0
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
|
|
|
@ -81,7 +81,8 @@ class Warnings(object):
|
|||
"Future versions may introduce a `n_process` argument for "
|
||||
"parallel inference via multiprocessing.")
|
||||
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
|
||||
"ignoring the duplicate entry.")
|
||||
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
|
||||
"previously loaded vectors. See Issue #3853.")
|
||||
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
||||
|
@ -531,6 +532,9 @@ class Errors(object):
|
|||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
||||
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
|
||||
E187 = ("Only unicode strings are supported as labels.")
|
||||
E188 = ("Could not match the gold entity links to entities in the doc - "
|
||||
"make sure the gold EL data refers to valid results of the "
|
||||
"named entity recognizer in the `nlp` pipeline.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
11
spacy/kb.pyx
11
spacy/kb.pyx
|
@ -136,19 +136,24 @@ cdef class KnowledgeBase:
|
|||
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
|
||||
raise ValueError(Errors.E140)
|
||||
|
||||
nr_entities = len(entity_list)
|
||||
nr_entities = len(set(entity_list))
|
||||
self._entry_index = PreshMap(nr_entities+1)
|
||||
self._entries = entry_vec(nr_entities+1)
|
||||
|
||||
i = 0
|
||||
cdef KBEntryC entry
|
||||
cdef hash_t entity_hash
|
||||
while i < nr_entities:
|
||||
while i < len(entity_list):
|
||||
# only process this entity if its unique ID hadn't been added before
|
||||
entity_hash = self.vocab.strings.add(entity_list[i])
|
||||
if entity_hash in self._entry_index:
|
||||
user_warning(Warnings.W018.format(entity=entity_list[i]))
|
||||
|
||||
else:
|
||||
entity_vector = vector_list[i]
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
|
||||
entity_hash = self.vocab.strings.add(entity_list[i])
|
||||
entry.entity_hash = entity_hash
|
||||
entry.freq = freq_list[i]
|
||||
|
||||
|
|
|
@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
|
|||
value = string_store.add(value)
|
||||
elif isinstance(value, bool):
|
||||
value = int(value)
|
||||
elif isinstance(value, (dict, int)):
|
||||
elif isinstance(value, int):
|
||||
pass
|
||||
elif isinstance(value, dict):
|
||||
continue
|
||||
else:
|
||||
raise ValueError(Errors.E153.format(vtype=type(value).__name__))
|
||||
|
|
|
@ -1142,7 +1142,7 @@ cdef class EntityRecognizer(Parser):
|
|||
|
||||
@component(
|
||||
"entity_linker",
|
||||
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||
assigns=["token.ent_kb_id"]
|
||||
)
|
||||
class EntityLinker(Pipe):
|
||||
|
@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
|
|||
for entity, kb_dict in gold.links.items():
|
||||
start, end = entity
|
||||
mention = doc.text[start:end]
|
||||
|
||||
# the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
|
||||
if not (start, end) in ents_by_offset:
|
||||
raise RuntimeError(Errors.E188)
|
||||
ent = ents_by_offset[(start, end)]
|
||||
|
||||
for kb_id, value in kb_dict.items():
|
||||
# Currently only training on the positive instances
|
||||
if value:
|
||||
try:
|
||||
sentence_docs.append(ent.sent.as_doc())
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030)
|
||||
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
|
||||
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
|
||||
|
|
|
@ -69,7 +69,8 @@ cdef class ParserBeam(object):
|
|||
cdef StateC* st
|
||||
for state in states:
|
||||
beam = Beam(self.moves.n_moves, width, min_density=density)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||
beam.initialize(self.moves.init_beam_state,
|
||||
self.moves.del_beam_state, state.c.length,
|
||||
state.c._sent)
|
||||
for i in range(beam.width):
|
||||
st = <StateC*>beam.at(i)
|
||||
|
|
|
@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
return <void*>st
|
||||
|
||||
|
||||
cdef int _del_state(Pool mem, void* state, void* x) except -1:
    # Destructor callback for a single beam state; the counterpart of
    # `_init_state`. `mem` and `x` are part of the callback signature but are
    # not used here.
    # NOTE(review): assumes `StateC` is a C++ class, so `del` on the typed
    # pointer releases it via C++ `delete` - confirm against the declaration.
    cdef StateC* beam_state = <StateC*>state
    del beam_state
    return 0
|
||||
|
||||
|
||||
cdef class ArcEager(TransitionSystem):
|
||||
def __init__(self, *args, **kwargs):
|
||||
TransitionSystem.__init__(self, *args, **kwargs)
|
||||
self.init_beam_state = _init_state
|
||||
self.del_beam_state = _del_state
|
||||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
|
|
|
@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
|||
|
||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||
|
||||
ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
|
||||
|
||||
cdef class TransitionSystem:
|
||||
cdef Pool mem
|
||||
cdef StringStore strings
|
||||
|
@ -42,6 +44,7 @@ cdef class TransitionSystem:
|
|||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
cdef del_state_t del_beam_state
|
||||
cdef public object labels
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil
|
||||
|
|
|
@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
|
|||
return <void*>st
|
||||
|
||||
|
||||
cdef int _del_state(Pool mem, void* state, void* x) except -1:
    # Destructor callback for a single beam state; the counterpart of
    # `_init_state`. `mem` and `x` are part of the callback signature but are
    # not used here.
    # NOTE(review): assumes `StateC` is a C++ class, so `del` on the typed
    # pointer releases it via C++ `delete` - confirm against the declaration.
    cdef StateC* beam_state = <StateC*>state
    del beam_state
    return 0
|
||||
|
||||
|
||||
cdef class TransitionSystem:
|
||||
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
|
||||
self.mem = Pool()
|
||||
|
@ -44,6 +49,7 @@ cdef class TransitionSystem:
|
|||
self.initialize_actions(labels_by_action, min_freq=min_freq)
|
||||
self.root_label = self.strings.add('ROOT')
|
||||
self.init_beam_state = _init_state
|
||||
self.del_beam_state = _del_state
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.strings, self.labels), None, None)
|
||||
|
@ -72,7 +78,8 @@ cdef class TransitionSystem:
|
|||
|
||||
for doc in docs:
|
||||
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
|
||||
beam.initialize(self.init_beam_state, doc.length, doc.c)
|
||||
beam.initialize(self.init_beam_state, self.del_beam_state,
|
||||
doc.length, doc.c)
|
||||
for i in range(beam.width):
|
||||
state = <StateC*>beam.at(i)
|
||||
state.offset = offset
|
||||
|
|
|
@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
|
|||
return doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "i_sent,i,j,text",
    [
        (0, 0, len("This is a"), "This is a"),
        (1, 0, len("This is another"), "This is another"),
        (2, len("And "), len("And ") + len("a third"), "a third"),
        (0, 1, 2, None),
    ],
)
def test_char_span(doc, i_sent, i, j, text):
    """Check `Span.char_span` with character offsets relative to a sentence.

    `i` and `j` are offsets into sentence number `i_sent`. `text` is the
    expected text of the resulting span, or None when the call is expected to
    produce a falsy result.
    """
    sentence = list(doc.sents)[i_sent]
    result = sentence.char_span(i, j)
    if text:
        assert result.text == text
        return
    # No expected text: the offsets should not yield a usable span.
    assert not result
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
|
34
spacy/tests/regression/test_issue4674.py
Normal file
34
spacy/tests/regression/test_issue4674.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.util import ensure_path
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4674():
    """Test that duplicate entity identifiers collapse to a single KB entry
    and that the KB still round-trips through dump / load_bulk."""
    nlp = English()
    original_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # Both entries use the identifier "Q1", so only one entity should end up
    # being stored in the knowledge base.
    original_kb.set_entities(
        entity_list=["Q1", "Q1"],
        freq_list=[32, 111],
        vector_list=[[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]],
    )
    assert original_kb.get_size_entities() == 1

    # Write the KB to a temporary location and read it back into a fresh KB.
    with make_tempdir() as tmp:
        tmp_dir = ensure_path(tmp)
        if not tmp_dir.exists():
            tmp_dir.mkdir()
        kb_file = str(tmp_dir / "kb")
        original_kb.dump(kb_file)

        reloaded_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        reloaded_kb.load_bulk(kb_file)

    assert reloaded_kb.get_size_entities() == 1
|
||||
|
|
@ -584,6 +584,22 @@ cdef class Span:
|
|||
else:
|
||||
return self.doc[root]
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
    """Create a `Span` object from the slice `span.text[start_idx : end_idx]`.

    The offsets are relative to the start of this span; they are shifted by
    `self.start_char` and the call is delegated to `Doc.char_span`.

    start_idx (int): The index of the first character of the span.
    end_idx (int): The index of the first character after the span.
    label (uint64 or string): A label to attach to the Span, e.g. for
        named entities.
    kb_id (uint64 or string): An ID from a KB to capture the meaning of a
        named entity.
    vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
        the span.
    RETURNS (Span): The newly constructed object, as returned by
        `Doc.char_span`.
    """
    start_idx += self.start_char
    end_idx += self.start_char
    # Forward the optional annotations too: previously they were accepted but
    # silently dropped, so the resulting span never carried them.
    return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||
|
||||
@property
|
||||
def conjuncts(self):
|
||||
"""Tokens that are conjoined to the span's root.
|
||||
|
|
|
@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx
|
|||
|
||||
A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
|
||||
named entities, export annotations to numpy arrays, losslessly serialize to
|
||||
compressed binary strings. The `Doc` object holds an array of `TokenC]` structs.
|
||||
compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
|
||||
The Python-level `Token` and [`Span`](/api/span) objects are views of this
|
||||
array, i.e. they don't own the data themselves.
|
||||
|
||||
|
|
|
@ -1261,6 +1261,21 @@
|
|||
},
|
||||
"category": ["podcasts"]
|
||||
},
|
||||
{
|
||||
"type": "education",
|
||||
"id": "practical-ai-podcast",
|
||||
"title": "Practical AI: Modern NLP with spaCy",
|
||||
"slogan": "December 2019",
|
||||
"description": "\"SpaCy is awesome for NLP! It’s easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. You don’t want to miss this episode!\"",
|
||||
"thumb": "https://i.imgur.com/jn8Bcdw.png",
|
||||
"url": "https://changelog.com/practicalai/68",
|
||||
"author": "Daniel Whitenack & Chris Benson",
|
||||
"author_links": {
|
||||
"website": "https://changelog.com/practicalai",
|
||||
"twitter": "https://twitter.com/PracticalAIFM"
|
||||
},
|
||||
"category": ["podcasts"]
|
||||
},
|
||||
{
|
||||
"id": "adam_qas",
|
||||
"title": "ADAM: Question Answering System",
|
||||
|
|
Loading…
Reference in New Issue
Block a user