Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-07-03 15:15:58 +02:00
commit 949d4a0a0b
25 changed files with 401 additions and 313 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a0"
__version__ = "3.0.0a1"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -9,7 +9,7 @@ import sys
from ._app import app, Arg, Opt
from ..gold import docs_to_json
from ..tokens import DocBin
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
# Converters are matched by file extension except for ner/iob, which are
@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
# imported from /converters.
CONVERTERS = {
# "conllubio": conllu2docs, TODO
# "conllu": conllu2docs, TODO
# "conll": conllu2docs, TODO
"conllubio": conllu2docs,
"conllu": conllu2docs,
"conll": conllu2docs,
"ner": conll_ner2docs,
"iob": iob2docs,
"json": json2docs,
@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
if output_type == "json":
srsly.write_json("-", docs_to_json(docs))
else:
sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
def _write_docs_to_file(docs, output_file, output_type):
@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
if output_type == "json":
srsly.write_json(output_file, docs_to_json(docs))
else:
data = DocBin(docs=docs).to_bytes()
data = DocBin(docs=docs, store_user_data=True).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)
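The `store_user_data=True` flag matters because `DocBin` drops `Doc.user_data` by default; both the writing and the reading `DocBin` have to opt in. A minimal round-trip sketch against this commit's API:

```python
from spacy.tokens import Doc, DocBin
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"], spaces=[True, False])
# Custom extension data lives in user_data under ("._.", name, start, end) keys
doc.user_data[("._.", "source", None, None)] = "converted"

data = DocBin(docs=[doc], store_user_data=True).to_bytes()
# The receiving DocBin must also be created with store_user_data=True
reloaded = list(DocBin(store_user_data=True).from_bytes(data).get_docs(vocab))
assert reloaded[0].user_data  # user data survived serialization
```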

View File

@ -56,6 +56,7 @@ def init_model_cli(
freqs_loc=freqs_loc,
clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc,
vectors_loc=vectors_loc,
prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors,
vectors_name=vectors_name,
@ -228,7 +229,7 @@ def add_vectors(
else:
if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
msg.good(f"Loaded vectors from {vectors_loc}")
else:
vectors_data, vector_keys = (None, None)
@ -247,7 +248,7 @@ def add_vectors(
nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1:
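The hunk cuts off after the header parse, so as a hedged illustration only: a vectors file starts with a `"<rows> <dims>"` header, and truncation caps the row count while leaving the dimensionality unchanged (hypothetical helper, not the actual function body):

```python
def truncated_shape(header_line: str, truncate_vectors: int):
    # Header of a word2vec-style text file: "<num_rows> <num_dims>"
    rows, dims = (int(size) for size in header_line.split())
    if truncate_vectors >= 1:
        # Read only the first N vectors; each vector keeps its dimensionality
        rows = min(rows, truncate_vectors)
    return rows, dims

assert truncated_shape("684831 300", 10000) == (10000, 300)
```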

View File

@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc
from ..attrs import ID, HEAD
from .. import util
from ..gold import Example
@app.command("pretrain")
@ -183,7 +182,7 @@ def pretrain(
for batch_id, batch in enumerate(batches):
docs, count = make_docs(
nlp,
[ex.doc for ex in batch],
batch,
max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"],
)

View File

@ -159,6 +159,8 @@ class Warnings(object):
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
@add_codes
@ -556,8 +558,8 @@ class Errors(object):
E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for 'links' need to refer exactly "
"to the offsets of the 'entities' annotations.")
E981 = ("The offsets of the annotations for 'links' could not be aligned "
"to token boundaries.")
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
@ -593,6 +595,8 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
@add_codes
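A small sketch (assuming the `Doc.from_docs` API added in this commit) of when the new codes fire:

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab_a, vocab_b = Vocab(), Vocab()
doc1 = Doc(vocab_a, words=["Hello", "world"])
doc2 = Doc(vocab_a, words=["Goodbye", "world"])

# W102: user data stored under keys that are not ("._.", name, start, end)
# tuples cannot be remapped onto the merged doc and is skipped with a warning
doc1.user_data["opaque_key"] = "value"
merged = Doc.from_docs([doc1, doc2])  # warns W102, merges everything else

# E999: all docs passed to Doc.from_docs must share a single Vocab
try:
    Doc.from_docs([doc1, Doc(vocab_b, words=["Hallo"])])
except ValueError as err:
    print(err)  # E999: ... do not all share the same `Vocab`
```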

View File

@ -1,6 +1,4 @@
from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs
# TODO: Update this one
# from .conllu2docs import conllu2docs # noqa: F401
from .conllu2docs import conllu2docs # noqa: F401

View File

@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language
from ...tokens import Doc, Token
from ...tokens import Doc, Token, Span
from wasabi import Printer
def conllu2json(
def conllu2docs(
input_data,
n_sents=10,
append_morphology=False,
@ -28,34 +28,22 @@ def conllu2json(
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents)
docs = []
raw = ""
sentences = []
conll_data = read_conllx(
sent_docs = read_conllx(
input_data,
append_morphology=append_morphology,
ner_tag_pattern=MISC_NER_PATTERN,
ner_map=ner_map,
merge_subtokens=merge_subtokens,
)
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
for i, example in enumerate(conll_data):
raw += example.text
sentences.append(
generate_sentence(
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
)
)
# Real-sized documents could be extracted using the comments on the
# conllu document
if len(sentences) % n_sents == 0:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
raw = ""
sentences = []
if sentences:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
docs = []
sent_docs_to_merge = []
for sent_doc in sent_docs:
sent_docs_to_merge.append(sent_doc)
if len(sent_docs_to_merge) % n_sents == 0:
docs.append(Doc.from_docs(sent_docs_to_merge))
sent_docs_to_merge = []
if sent_docs_to_merge:
docs.append(Doc.from_docs(sent_docs_to_merge))
return docs
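A hedged usage sketch of the rewritten converter: per-sentence docs come from `read_conllx`, and every `n_sents` of them are merged into one `Doc` via the new `Doc.from_docs`:

```python
from spacy.gold.converters import conllu2docs  # import path as of this commit

sent1 = "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\tO"
sent2 = "1\tWorld\tworld\tNOUN\t_\t_\t0\troot\t_\tO"
input_data = sent1 + "\n\n" + sent2  # CoNLL-U sentences are blank-line separated

docs = conllu2docs(input_data, n_sents=2)
assert len(docs) == 1  # both single-sentence docs merged into one Doc
```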
@ -84,14 +72,14 @@ def read_conllx(
ner_tag_pattern="",
ner_map=None,
):
""" Yield examples, one for each sentence """
""" Yield docs, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n")
if lines:
while lines[0].startswith("#"):
lines.pop(0)
example = example_from_conllu_sentence(
doc = doc_from_conllu_sentence(
vocab,
lines,
ner_tag_pattern,
@ -99,7 +87,7 @@ def read_conllx(
append_morphology=append_morphology,
ner_map=ner_map,
)
yield example
yield doc
def get_entities(lines, tag_pattern, ner_map=None):
@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
return iob_to_biluo(iob)
def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
sentence = {}
tokens = []
token_annotation = example_dict["token_annotation"]
for i, id_ in enumerate(token_annotation["ids"]):
token = {}
token["id"] = id_
token["orth"] = token_annotation["words"][i]
token["tag"] = token_annotation["tags"][i]
token["pos"] = token_annotation["pos"][i]
token["lemma"] = token_annotation["lemmas"][i]
token["morph"] = token_annotation["morphs"][i]
token["head"] = token_annotation["heads"][i] - i
token["dep"] = token_annotation["deps"][i]
if has_ner_tags:
token["ner"] = example_dict["doc_annotation"]["entities"][i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence
def create_json_doc(raw, sentences, id_):
doc = {}
paragraph = {}
doc["id"] = id_
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
def example_from_conllu_sentence(
def doc_from_conllu_sentence(
vocab,
lines,
ner_tag_pattern,
@ -263,8 +219,9 @@ def example_from_conllu_sentence(
if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc)
# create Example from custom Doc annotation
words, spaces, tags, morphs, lemmas = [], [], [], [], []
# create final Doc from custom Doc annotation
words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
heads, deps = [], []
for i, t in enumerate(doc):
words.append(t._.merged_orth)
lemmas.append(t._.merged_lemma)
@ -274,16 +231,23 @@ def example_from_conllu_sentence(
tags.append(t.tag_ + "__" + t._.merged_morph)
else:
tags.append(t.tag_)
poses.append(t.pos_)
heads.append(t.head.i)
deps.append(t.dep_)
doc_x = Doc(vocab, words=words, spaces=spaces)
ref_dict = Example(doc_x, reference=doc).to_dict()
ref_dict["words"] = words
ref_dict["lemmas"] = lemmas
ref_dict["spaces"] = spaces
ref_dict["tags"] = tags
ref_dict["morphs"] = morphs
example = Example.from_dict(doc_x, ref_dict)
return example
for i in range(len(doc)):
doc_x[i].tag_ = tags[i]
doc_x[i].morph_ = morphs[i]
doc_x[i].lemma_ = lemmas[i]
doc_x[i].pos_ = poses[i]
doc_x[i].dep_ = deps[i]
doc_x[i].head = doc_x[heads[i]]
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
doc_x.is_parsed = True
doc_x.is_tagged = True
return doc_x
def merge_conllu_subtokens(lines, doc):
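The conversion above builds a fresh `Doc` and assigns token attributes directly; the same pattern in miniature (a sketch, using the `is_tagged`/`is_parsed` flags this code still relies on):

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["She", "sleeps"], spaces=[True, False])
doc[0].tag_, doc[0].pos_, doc[0].dep_ = "PRP", "PRON", "nsubj"
doc[1].tag_, doc[1].pos_, doc[1].dep_ = "VBZ", "VERB", "ROOT"
doc[0].head = doc[1]
doc[1].head = doc[1]  # the root token is its own head
doc.is_tagged = True  # mark tag/pos annotation as complete
doc.is_parsed = True  # mark dependency annotation as complete
```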

View File

@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
for json_para in json_to_annotations(json_doc):
example_dict = _fix_legacy_dict_data(json_para)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if json_para.get("raw"):
assert tok_dict.get("SPACY")
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
docs.append(doc)
return docs

View File

@ -43,24 +43,35 @@ class Corpus:
locs.append(path)
return locs
def _make_example(self, nlp, reference, gold_preproc):
if gold_preproc or reference.has_unknown_spaces:
return Example(
Doc(
nlp.vocab,
words=[word.text for word in reference],
spaces=[bool(word.whitespace_) for word in reference]
),
reference
)
else:
return Example(
nlp.make_doc(reference.text),
reference
)
def make_examples(self, nlp, reference_docs, max_length=0):
for reference in reference_docs:
if len(reference) == 0:
continue
elif max_length == 0 or len(reference) < max_length:
yield Example(
nlp.make_doc(reference.text),
reference
)
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
elif max_length == 0 or len(ref_sent) < max_length:
yield Example(
nlp.make_doc(ref_sent.text),
ref_sent.as_doc()
)
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(self, nlp, reference_docs):
for reference in reference_docs:
@ -69,14 +80,7 @@ class Corpus:
else:
ref_sents = [reference]
for ref_sent in ref_sents:
eg = Example(
Doc(
nlp.vocab,
words=[w.text for w in ref_sent],
spaces=[bool(w.whitespace_) for w in ref_sent]
),
ref_sent
)
eg = self._make_example(nlp, ref_sent, True)
if len(eg.x):
yield eg
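The two branches of the new `_make_example` helper in isolation (a sketch against this commit's `Example` API): with gold preprocessing, the predicted doc copies the reference tokenization; otherwise the raw text is re-tokenized and may align differently:

```python
from spacy.lang.en import English
from spacy.gold import Example
from spacy.tokens import Doc

nlp = English()
reference = Doc(nlp.vocab, words=["New", "York", "."], spaces=[True, False, False])

# gold_preproc=True: words and spaces are copied, so tokenization matches gold
eg_gold = Example(
    Doc(nlp.vocab, words=[w.text for w in reference],
        spaces=[bool(w.whitespace_) for w in reference]),
    reference,
)

# gold_preproc=False: the tokenizer runs on the raw text instead
eg_raw = Example(nlp.make_doc(reference.text), reference)
```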

View File

@ -15,7 +15,7 @@ from ..syntax import nonproj
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
""" Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
""" Create a Doc from dictionaries with token and doc annotations. """
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
if "entities" in doc_annot:
@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key == "entities":
pass
elif key == "links":
entities = doc_annot.get("entities", {})
if not entities:
raise ValueError(Errors.E981)
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
tok_annot["ENT_KB_ID"] = ent_kb_ids
elif key == "cats":
pass
@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("")
return ent_iobs, ent_types
def _parse_links(vocab, words, links, entities):
reference = Doc(vocab, words=words)
def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference}
ends = {token.idx + len(token): token.i for token in reference}
ent_kb_ids = ["" for _ in reference]
entity_map = [(ent[0], ent[1]) for ent in entities]
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
for index, annot_dict in links.items():
start_char, end_char = index
if (start_char, end_char) not in entity_map:
raise ValueError(Errors.E981)
for index, annot_dict in links.items():
true_kb_ids = []
@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities):
start_char, end_char = index
start_token = starts.get(start_char)
end_token = ends.get(end_char)
if start_token is None or end_token is None:
raise ValueError(Errors.E981)
for i in range(start_token, end_token+1):
ent_kb_ids[i] = true_kb_ids[0]
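The alignment requirement behind the reworked E981: link offsets are resolved through the `starts`/`ends` lookup tables, so both ends must land exactly on token boundaries. A small illustration:

```python
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["I", "like", "New", "York"],
          spaces=[True, True, True, False])  # text: "I like New York"
starts = {token.idx: token.i for token in doc}             # {0: 0, 2: 1, 7: 2, 11: 3}
ends = {token.idx + len(token): token.i for token in doc}  # {1: 0, 6: 1, 10: 2, 15: 3}

# (7, 15) covers "New York" exactly, so it resolves to tokens 2..3
assert starts.get(7) == 2 and ends.get(15) == 3
# (7, 14) ends mid-token: ends.get(14) is None, which now raises E981
assert ends.get(14) is None
```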
@ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities):
def _guess_spaces(text, words):
if text is None:
return [True] * len(words)
return None
spaces = []
text_pos = 0
# align words with text

View File

@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
assert new_doc.is_parsed
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "They don't think alike."]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
docs_idx = en_texts[0].index('docs')
de_doc = de_tokenizer(de_text)
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc]))
with pytest.raises(ValueError):
Doc.from_docs(en_docs + [de_doc])
m_doc = Doc.from_docs(en_docs)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts)
p_token = m_doc[len(en_docs[0])-1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
with pytest.raises(AttributeError):
not_available = m_doc[2]._.is_ambiguous # not accessible, because it was not registered via set_extension
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == "".join(en_texts)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
with pytest.raises(ValueError): # important attributes from sentencizer or parser are missing
assert list(m_doc.sents)
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
def test_doc_lang(en_vocab):
doc = Doc(en_vocab, words=["Hello", "world"])
assert doc.lang_ == "en"

View File

@ -75,3 +75,19 @@ def test_serialize_doc_bin():
for i, doc in enumerate(reloaded_docs):
assert doc.text == texts[i]
assert doc.cats == cats
def test_serialize_doc_bin_unknown_spaces(en_vocab):
doc1 = Doc(en_vocab, words=["that", "'s"])
assert doc1.has_unknown_spaces
assert doc1.text == "that 's "
doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
assert not doc2.has_unknown_spaces
assert doc2.text == "that's"
doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
assert re_doc1.has_unknown_spaces
assert re_doc1.text == "that 's "
assert not re_doc2.has_unknown_spaces
assert re_doc2.text == "that's"

View File

@ -1,14 +1,10 @@
import pytest
from spacy.gold import docs_to_json
from spacy.gold.converters import iob2docs, conll_ner2docs
from spacy.gold.converters.conllu2json import conllu2json
from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.cli.pretrain import make_docs
# TODO
# from spacy.gold.converters import conllu2docs
def test_cli_converters_conllu2json():
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
]
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1)
assert len(converted) == 1
converted_docs = conllu2docs(input_data, n_sents=1)
assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@pytest.mark.parametrize(
@ -55,8 +54,9 @@ def test_cli_converters_conllu2json():
)
def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1
converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu2json_subtokens():
@ -82,10 +84,12 @@ def test_cli_converters_conllu2json_subtokens():
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
]
input_data = "\n".join(lines)
converted = conllu2json(
converted_docs = conllu2docs(
input_data, n_sents=1, merge_subtokens=True, append_morphology=True
)
assert len(converted) == 1
assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "U-PER", "O", "O"]
def test_cli_converters_iob2json(en_vocab):

View File

@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
[
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
"links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
}
],
)

View File

@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
class DocBin(object):
@ -31,6 +31,7 @@ class DocBin(object):
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data
"version": str, # DocBin version number
}
Strings for the words, tags, labels etc are represented by 64-bit hashes in
@ -53,12 +54,14 @@ class DocBin(object):
DOCS: https://spacy.io/api/docbin#init
"""
attrs = sorted([intify_attr(attr) for attr in attrs])
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = []
self.spaces = []
self.cats = []
self.user_data = []
self.flags = []
self.strings = set()
self.store_user_data = store_user_data
for doc in docs:
@ -83,12 +86,17 @@ class DocBin(object):
assert array.shape[0] == spaces.shape[0] # this should never happen
spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.flags.append({
"has_unknown_spaces": doc.has_unknown_spaces
})
for token in doc:
self.strings.add(token.text)
self.strings.add(token.tag_)
self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(token.dep_)
self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)
self.cats.append(doc.cats)
if self.store_user_data:
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@ -105,8 +113,11 @@ class DocBin(object):
vocab[string]
orth_col = self.attrs.index(ORTH)
for i in range(len(self.tokens)):
flags = self.flags[i]
tokens = self.tokens[i]
spaces = self.spaces[i]
if flags.get("has_unknown_spaces"):
spaces = None
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
doc = doc.from_array(self.attrs, tokens)
doc.cats = self.cats[i]
@ -130,6 +141,7 @@ class DocBin(object):
self.spaces.extend(other.spaces)
self.strings.update(other.strings)
self.cats.extend(other.cats)
self.flags.extend(other.flags)
if self.store_user_data:
self.user_data.extend(other.user_data)
@ -147,12 +159,14 @@ class DocBin(object):
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = {
"version": self.version,
"attrs": self.attrs,
"tokens": tokens.tobytes("C"),
"spaces": spaces.tobytes("C"),
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings),
"cats": self.cats,
"flags": self.flags,
}
if self.store_user_data:
msg["user_data"] = self.user_data
@ -178,6 +192,7 @@ class DocBin(object):
self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
self.cats = msg["cats"]
self.flags = msg.get("flags", [{} for _ in lengths])
if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"])
for tokens in self.tokens:

View File

@ -59,11 +59,14 @@ cdef class Doc:
cdef public dict user_token_hooks
cdef public dict user_span_hooks
cdef public bint has_unknown_spaces
cdef public list _py_tokens
cdef int length
cdef int max_length
cdef public object noun_chunks_iterator
cdef object __weakref__

View File

@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter
import numpy
import numpy.linalg
@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS
from ..attrs import intify_attr, intify_attrs, IDS
from ..util import normalize_slice
from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings
@ -171,8 +172,7 @@ cdef class Doc:
raise ValueError(Errors.E046.format(name=name))
return Underscore.doc_extensions.pop(name)
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
orths_and_spaces=None):
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you
@ -214,27 +214,24 @@ cdef class Doc:
self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
cdef bint has_space
if orths_and_spaces is None and words is not None:
if spaces is None:
spaces = [True] * len(words)
elif len(spaces) != len(words):
raise ValueError(Errors.E027)
orths_and_spaces = zip(words, spaces)
cdef const LexemeC* lexeme
if orths_and_spaces is not None:
orths_and_spaces = list(orths_and_spaces)
for orth_space in orths_and_spaces:
if isinstance(orth_space, unicode):
lexeme = self.vocab.get(self.mem, orth_space)
has_space = True
elif isinstance(orth_space, bytes):
raise ValueError(Errors.E028.format(value=orth_space))
elif isinstance(orth_space[0], unicode):
lexeme = self.vocab.get(self.mem, orth_space[0])
has_space = orth_space[1]
if words is None and spaces is not None:
raise ValueError("words must be set if spaces is set")
elif spaces is None and words is not None:
self.has_unknown_spaces = True
else:
lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
has_space = orth_space[1]
self.has_unknown_spaces = False
words = words if words is not None else []
spaces = spaces if spaces is not None else ([True] * len(words))
if len(spaces) != len(words):
raise ValueError(Errors.E027)
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, unicode):
lexeme = self.vocab.get(self.mem, word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
lexeme = self.vocab.get_by_orth(self.mem, word)
self.push_back(lexeme, has_space)
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
# There's no information we'd like to add to it, so I guess so?
@ -806,7 +803,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs]
if array.dtype != numpy.uint64:
warnings.warn(Warnings.W028.format(type=array.dtype))
warnings.warn(Warnings.W101.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032)
@ -882,6 +879,87 @@ cdef class Doc:
set_children_from_heads(self.c, length)
return self
@staticmethod
def from_docs(docs, ensure_whitespace=True, attrs=None):
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
the same `Vocab`.
docs (list): A list of Doc objects.
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
attrs (list): Optional list of attribute ID ints or attribute name strings.
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
DOCS: https://spacy.io/api/doc#from_docs
"""
if not docs:
return None
vocab = {doc.vocab for doc in docs}
if len(vocab) > 1:
raise ValueError(Errors.E999)
(vocab,) = vocab
if attrs is None:
attrs = [LEMMA, NORM]
if all(doc.is_nered for doc in docs):
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
# TODO: separate for is_morphed?
if all(doc.is_tagged for doc in docs):
attrs.extend([TAG, POS, MORPH])
if all(doc.is_parsed for doc in docs):
attrs.extend([HEAD, DEP])
else:
attrs.append(SENT_START)
else:
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present
if SPACY not in attrs:
attrs.append(SPACY)
concat_words = []
concat_spaces = []
concat_user_data = {}
char_offset = 0
for doc in docs:
concat_words.extend(t.text for t in doc)
concat_spaces.extend(bool(t.whitespace_) for t in doc)
for key, value in doc.user_data.items():
if isinstance(key, tuple) and len(key) == 4:
data_type, name, start, end = key
if start is not None or end is not None:
start += char_offset
if end is not None:
end += char_offset
concat_user_data[(data_type, name, start, end)] = copy.copy(value)
else:
warnings.warn(Warnings.W101.format(name=name))
else:
warnings.warn(Warnings.W102.format(key=key, value=value))
char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
arrays = [doc.to_array(attrs) for doc in docs]
if ensure_whitespace:
spacy_index = attrs.index(SPACY)
for i, array in enumerate(arrays[:-1]):
if len(array) > 0 and not docs[i][-1].is_space:
array[-1][spacy_index] = 1
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
concat_doc.from_array(attrs, concat_array)
return concat_doc
def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Doc`, where LCA[i, j] is the index of the lowest common ancestor among
@ -1000,6 +1078,7 @@ cdef class Doc:
"sentiment": lambda: self.sentiment,
"tensor": lambda: self.tensor,
"cats": lambda: self.cats,
"has_unknown_spaces": lambda: self.has_unknown_spaces
}
for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@ -1032,6 +1111,7 @@ cdef class Doc:
"cats": lambda b: None,
"user_data_keys": lambda b: None,
"user_data_values": lambda b: None,
"has_unknown_spaces": lambda b: None
}
for key in kwargs:
if key in deserializers or key in ("user_data",):
@ -1052,6 +1132,8 @@ cdef class Doc:
self.tensor = msg["tensor"]
if "cats" not in exclude and "cats" in msg:
self.cats = msg["cats"]
if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
self.has_unknown_spaces = msg["has_unknown_spaces"]
start = 0
cdef const LexemeC* lex
cdef unicode orth_
@ -1123,50 +1205,6 @@ cdef class Doc:
remove_label_if_necessary(attributes[i])
retokenizer.merge(span, attributes[i])
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx` do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
# TODO: ENT_KB_ID ?
if len(args) == 3:
warnings.warn(Warnings.W003, DeprecationWarning)
tag, lemma, ent_type = args
attributes[TAG] = tag
attributes[LEMMA] = lemma
attributes[ENT_TYPE] = ent_type
elif not args:
fix_attributes(self, attributes)
elif args:
raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
kwargs=repr(attributes)))
remove_label_if_necessary(attributes)
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
with self.retokenize() as retokenizer:
retokenizer.merge(self[start:end], attrs=attributes)
return self[start]
def print_tree(self, light=False, flat=False):
raise ValueError(Errors.E105)
def to_json(self, underscore=None):
"""Convert a Doc to JSON. The format it produces will be the new format
for the `spacy train` command (not implemented yet).

View File

@ -280,18 +280,6 @@ cdef class Span:
return array
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single
token.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Span`, where LCA[i, j] is the index of the lowest common ancestor among

View File

@ -349,6 +349,33 @@ array of attributes.
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. |
## Doc.from_docs {#from_docs tag="staticmethod"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
> #### Example
>
> ```python
> from spacy.tokens import Doc
> texts = ["London is the capital of the United Kingdom.",
> "The River Thames flows through London.",
> "The famous Tower Bridge crosses the River Thames."]
> docs = list(nlp.pipe(texts))
> c_doc = Doc.from_docs(docs)
> assert str(c_doc) == " ".join(texts)
> assert len(list(c_doc.sents)) == len(docs)
> assert [str(ent) for ent in c_doc.ents] == \
> [str(ent) for doc in docs for ent in doc.ents]
> ```
| Name | Type | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
## Doc.to_disk {#to_disk tag="method" new="2"}
Save the current state to a directory.

View File

@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:
```python
### msgpack object strcutrue
### msgpack object structure
{
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. |
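A short usage sketch of those defaults (a sketch against this commit's API; `POS` is newly included in the serialized attributes):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # any pipeline works; a blank English one here
doc_bin = DocBin(attrs=["LEMMA", "POS"], store_user_data=True)
for doc in nlp.pipe(["Hello world.", "Another doc."]):
    doc_bin.add(doc)
data = doc_bin.to_bytes()  # gzipped msgpack, now including a "version" field
```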

View File

@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
## Matcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc`. As of
spaCy v2.3, the `Matcher` can also be called on `Span` objects.
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> #### Example
>
@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
>
> matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern)
> matcher.add("HelloWorld", [pattern])
> doc = nlp("hello world!")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
<Infobox title="Important note" variant="warning">
By default, the matcher **does not perform any action** on matches, like tagging
matched phrases with entity types. Instead, actions need to be specified when
**adding patterns or entities**, by passing in a callback function as the
`on_match` argument on [`add`](/api/matcher#add). This allows you to define
custom actions per pattern within the same matcher. For example, you might only
want to merge some entity types, and set custom flags for other matched
patterns. For more details and examples, see the usage guide on
[rule-based matching](/usage/rule-based-matching).
</Infobox>
## Matcher.pipe {#pipe tag="method"}
Match a stream of documents, yielding them in turn.
@ -92,7 +78,7 @@ patterns.
> ```python
> matcher = Matcher(nlp.vocab)
> assert len(matcher) == 0
> matcher.add("Rule", None, [{"ORTH": "test"}])
> matcher.add("Rule", [[{"ORTH": "test"}]])
> assert len(matcher) == 1
> ```
@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
>
> ```python
> matcher = Matcher(nlp.vocab)
> assert 'Rule' not in matcher
> matcher.add('Rule', None, [{'ORTH': 'test'}])
> assert 'Rule' in matcher
> assert "Rule" not in matcher
> matcher.add("Rule", [[{'ORTH': 'test'}]])
> assert "Rule" in matcher
> ```
| Name | Type | Description |
@ -133,35 +119,35 @@ overwritten.
> print('Matched!', matches)
>
> matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
> patterns = [
> [{"LOWER": "hello"}, {"LOWER": "world"}],
> [{"ORTH": "Google"}, {"ORTH": "Maps"}]
> ]
> matcher.add("TEST_PATTERNS", patterns)
> doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
<Infobox title="Changed in v3.0" variant="warning">
<Infobox title="Changed in v2.2.2" variant="warning">
As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
the default in the future. The patterns are now the second argument and a list
As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
(instead of a variable number of arguments). The `on_match` callback becomes an
optional keyword argument.
```diff
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
- matcher.add("GoogleNow", None, *patterns)
+ matcher.add("GoogleNow", patterns)
- matcher.add("GoogleNow", on_match, *patterns)
+ matcher.add("GoogleNow", patterns, on_match=on_match)
```
</Infobox>
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## Matcher.remove {#remove tag="method" new="2"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@ -170,7 +156,7 @@ exist.
> #### Example
>
> ```python
> matcher.add("Rule", None, [{"ORTH": "test"}])
> matcher.add("Rule", [[{"ORTH": "test"}]])
> assert "Rule" in matcher
> matcher.remove("Rule")
> assert "Rule" not in matcher
@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> #### Example
>
> ```python
> matcher.add("Rule", None, [{"ORTH": "test"}])
> matcher.add("Rule", [[{"ORTH": "test"}]])
> on_match, patterns = matcher.get("Rule")
> ```

View File

@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher
>
> matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> matcher.add("OBAMA", [nlp("Barack Obama")])
> doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc)
> ```
@ -104,7 +104,7 @@ patterns.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert len(matcher) == 1
> ```
@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher
> ```
@ -145,36 +145,32 @@ overwritten.
> print('Matched!', matches)
>
> matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
> matcher.add("HEALTH", on_match, nlp("health care reform"),
> nlp("healthcare reform"))
> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
<Infobox title="Changed in v3.0" variant="warning">
<Infobox title="Changed in v2.2.2" variant="warning">
As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
become the default in the future. The `Doc` patterns are now the second argument
and a list (instead of a variable number of arguments). The `on_match` callback
As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
argument (instead of a variable number of arguments). The `on_match` callback
becomes an optional keyword argument.
```diff
patterns = [nlp("health care reform"), nlp("healthcare reform")]
- matcher.add("HEALTH", None, *patterns)
+ matcher.add("HEALTH", patterns)
- matcher.add("HEALTH", on_match, *patterns)
+ matcher.add("HEALTH", patterns, on_match=on_match)
```
</Infobox>
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `docs` | list | `Doc` objects of the phrases to match. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@ -184,7 +180,7 @@ does not exist.
>
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher
> matcher.remove("OBAMA")
> assert "OBAMA" not in matcher

View File

@ -407,7 +407,7 @@ class EntityMatcher(object):
def __init__(self, nlp, terms, label):
patterns = [nlp.make_doc(text) for text in terms]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add(label, None, *patterns)
self.matcher.add(label, patterns)
def __call__(self, doc):
matches = self.matcher(doc)

View File

@ -98,9 +98,7 @@ print([token.text for token in doc])
First, we initialize the `Matcher` with a vocab. The matcher must always share
the same vocab with the documents it will operate on. We can now call
[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
second argument lets you pass in an optional callback function to invoke on a
successful match. For now, we set it to `None`.
[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
```python
### {executable="true"}
@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
matcher.add("HelloWorld", [pattern])
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
also match sequences without punctuation between "hello" and "world":
```python
matcher.add("HelloWorld", None,
patterns = [
[{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
[{"LOWER": "hello"}, {"LOWER": "world"}])
[{"LOWER": "hello"}, {"LOWER": "world"}]
]
matcher.add("HelloWorld", patterns)
```
By default, the matcher will only return the matches and **not do anything
@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
matcher.add("HelloWorld", None, pattern)
matcher.add("HelloWorld", [pattern])
# 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0:
@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
print(entity.text)
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern)
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)
```
@ -509,19 +509,18 @@ import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
# We're using a class because the component needs to be initialised with
# We're using a class because the component needs to be initialized with
# the shared vocab via the nlp object
class BadHTMLMerger(object):
def __init__(self, nlp):
patterns = [
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
]
# Register a new token extension to flag bad HTML
Token.set_extension("bad_html", default=False)
self.matcher = Matcher(nlp.vocab)
self.matcher.add(
"BAD_HTML",
None,
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
)
self.matcher.add("BAD_HTML", patterns)
def __call__(self, doc):
# This method is invoked when the component is called on a Doc
@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern
matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern
doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?")
matches = matcher(doc)
@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern)
matcher.add("PHONE_NUMBER", [pattern])
doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
elif doc.vocab.strings[match_id] == "SAD":
doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment
matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern
matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern
matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)
matcher.add("TerminologyList", patterns)
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
"converse in the Oval Office inside the White House in Washington, D.C.")
@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("Names", None, *patterns)
matcher.add("Names", patterns)
doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc):
@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc):

View File

@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1
pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji
doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc)