Merge branch 'develop' into nightly.spacy.io

Commit 949d4a0a0b by Ines Montani, 2020-07-03 15:15:58 +02:00
25 changed files with 401 additions and 313 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a0" __version__ = "3.0.0a1"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -9,7 +9,7 @@ import sys
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..gold import docs_to_json from ..gold import docs_to_json
from ..tokens import DocBin from ..tokens import DocBin
from ..gold.converters import iob2docs, conll_ner2docs, json2docs from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
# imported from /converters. # imported from /converters.
CONVERTERS = { CONVERTERS = {
# "conllubio": conllu2docs, TODO "conllubio": conllu2docs,
# "conllu": conllu2docs, TODO "conllu": conllu2docs,
# "conll": conllu2docs, TODO "conll": conllu2docs,
"ner": conll_ner2docs, "ner": conll_ner2docs,
"iob": iob2docs, "iob": iob2docs,
"json": json2docs, "json": json2docs,
@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
if output_type == "json": if output_type == "json":
srsly.write_json("-", docs_to_json(docs)) srsly.write_json("-", docs_to_json(docs))
else: else:
sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
def _write_docs_to_file(docs, output_file, output_type): def _write_docs_to_file(docs, output_file, output_type):
@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
if output_type == "json": if output_type == "json":
srsly.write_json(output_file, docs_to_json(docs)) srsly.write_json(output_file, docs_to_json(docs))
else: else:
data = DocBin(docs=docs).to_bytes() data = DocBin(docs=docs, store_user_data=True).to_bytes()
with output_file.open("wb") as file_: with output_file.open("wb") as file_:
file_.write(data) file_.write(data)
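
The converter's binary output is now serialized with `store_user_data=True`. A minimal sketch of the round trip this enables, assuming a blank English pipeline:

```python
# Minimal sketch: serialize converted docs together with their user data and
# read them back from the DocBin bytes.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Serialize me."), nlp("And me too.")]
data = DocBin(docs=docs, store_user_data=True).to_bytes()
restored = list(DocBin(store_user_data=True).from_bytes(data).get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]
```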

View File

@ -37,7 +37,7 @@ def init_model_cli(
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
@ -56,6 +56,7 @@ def init_model_cli(
freqs_loc=freqs_loc, freqs_loc=freqs_loc,
clusters_loc=clusters_loc, clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc, jsonl_loc=jsonl_loc,
vectors_loc=vectors_loc,
prune_vectors=prune_vectors, prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors, truncate_vectors=truncate_vectors,
vectors_name=vectors_name, vectors_name=vectors_name,
@ -228,7 +229,7 @@ def add_vectors(
else: else:
if vectors_loc: if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"): with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(msg, vectors_loc) vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
msg.good(f"Loaded vectors from {vectors_loc}") msg.good(f"Loaded vectors from {vectors_loc}")
else: else:
vectors_data, vector_keys = (None, None) vectors_data, vector_keys = (None, None)
@ -247,7 +248,7 @@ def add_vectors(
nlp.vocab.prune_vectors(prune_vectors) nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
f = open_file(vectors_loc) f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1: if truncate_vectors >= 1:

View File

@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ID, HEAD from ..attrs import ID, HEAD
from .. import util from .. import util
from ..gold import Example
@app.command("pretrain") @app.command("pretrain")
@ -183,7 +182,7 @@ def pretrain(
for batch_id, batch in enumerate(batches): for batch_id, batch in enumerate(batches):
docs, count = make_docs( docs, count = make_docs(
nlp, nlp,
[ex.doc for ex in batch], batch,
max_length=pretrain_config["max_length"], max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"], min_length=pretrain_config["min_length"],
) )

View File

@ -159,6 +159,8 @@ class Warnings(object):
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".") "string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
@add_codes @add_codes
@ -556,8 +558,8 @@ class Errors(object):
E979 = ("Cannot convert {type} to an Example object.") E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one " E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.") "identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for 'links' need to refer exactly " E981 = ("The offsets of the annotations for 'links' could not be aligned "
"to the offsets of the 'entities' annotations.") "to token boundaries.")
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: " E983 = ("Invalid key for '{dict}': {key}. Available keys: "
@ -593,7 +595,9 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. " E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes " "This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.") "'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
@add_codes @add_codes
class TempErrors(object): class TempErrors(object):

View File

@ -1,6 +1,4 @@
from .iob2docs import iob2docs # noqa: F401 from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs from .json2docs import json2docs
from .conllu2docs import conllu2docs # noqa: F401
# TODO: Update this one
# from .conllu2docs import conllu2docs # noqa: F401

View File

@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
from ...gold import Example from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language from ...language import Language
from ...tokens import Doc, Token from ...tokens import Doc, Token, Span
from wasabi import Printer from wasabi import Printer
def conllu2json( def conllu2docs(
input_data, input_data,
n_sents=10, n_sents=10,
append_morphology=False, append_morphology=False,
@ -28,34 +28,22 @@ def conllu2json(
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print) msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
docs = [] sent_docs = read_conllx(
raw = ""
sentences = []
conll_data = read_conllx(
input_data, input_data,
append_morphology=append_morphology, append_morphology=append_morphology,
ner_tag_pattern=MISC_NER_PATTERN, ner_tag_pattern=MISC_NER_PATTERN,
ner_map=ner_map, ner_map=ner_map,
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
) )
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) docs = []
for i, example in enumerate(conll_data): sent_docs_to_merge = []
raw += example.text for sent_doc in sent_docs:
sentences.append( sent_docs_to_merge.append(sent_doc)
generate_sentence( if len(sent_docs_to_merge) % n_sents == 0:
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, docs.append(Doc.from_docs(sent_docs_to_merge))
) sent_docs_to_merge = []
) if sent_docs_to_merge:
# Real-sized documents could be extracted using the comments on the docs.append(Doc.from_docs(sent_docs_to_merge))
# conllu document
if len(sentences) % n_sents == 0:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
raw = ""
sentences = []
if sentences:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
return docs return docs
@ -84,14 +72,14 @@ def read_conllx(
ner_tag_pattern="", ner_tag_pattern="",
ner_map=None, ner_map=None,
): ):
""" Yield examples, one for each sentence """ """ Yield docs, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
example = example_from_conllu_sentence( doc = doc_from_conllu_sentence(
vocab, vocab,
lines, lines,
ner_tag_pattern, ner_tag_pattern,
@ -99,7 +87,7 @@ def read_conllx(
append_morphology=append_morphology, append_morphology=append_morphology,
ner_map=ner_map, ner_map=ner_map,
) )
yield example yield doc
def get_entities(lines, tag_pattern, ner_map=None): def get_entities(lines, tag_pattern, ner_map=None):
@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
return iob_to_biluo(iob) return iob_to_biluo(iob)
def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None): def doc_from_conllu_sentence(
sentence = {}
tokens = []
token_annotation = example_dict["token_annotation"]
for i, id_ in enumerate(token_annotation["ids"]):
token = {}
token["id"] = id_
token["orth"] = token_annotation["words"][i]
token["tag"] = token_annotation["tags"][i]
token["pos"] = token_annotation["pos"][i]
token["lemma"] = token_annotation["lemmas"][i]
token["morph"] = token_annotation["morphs"][i]
token["head"] = token_annotation["heads"][i] - i
token["dep"] = token_annotation["deps"][i]
if has_ner_tags:
token["ner"] = example_dict["doc_annotation"]["entities"][i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence
def create_json_doc(raw, sentences, id_):
doc = {}
paragraph = {}
doc["id"] = id_
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
def example_from_conllu_sentence(
vocab, vocab,
lines, lines,
ner_tag_pattern, ner_tag_pattern,
@ -263,8 +219,9 @@ def example_from_conllu_sentence(
if merge_subtokens: if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc) doc = merge_conllu_subtokens(lines, doc)
# create Example from custom Doc annotation # create final Doc from custom Doc annotation
words, spaces, tags, morphs, lemmas = [], [], [], [], [] words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
heads, deps = [], []
for i, t in enumerate(doc): for i, t in enumerate(doc):
words.append(t._.merged_orth) words.append(t._.merged_orth)
lemmas.append(t._.merged_lemma) lemmas.append(t._.merged_lemma)
@ -274,16 +231,23 @@ def example_from_conllu_sentence(
tags.append(t.tag_ + "__" + t._.merged_morph) tags.append(t.tag_ + "__" + t._.merged_morph)
else: else:
tags.append(t.tag_) tags.append(t.tag_)
poses.append(t.pos_)
heads.append(t.head.i)
deps.append(t.dep_)
doc_x = Doc(vocab, words=words, spaces=spaces) doc_x = Doc(vocab, words=words, spaces=spaces)
ref_dict = Example(doc_x, reference=doc).to_dict() for i in range(len(doc)):
ref_dict["words"] = words doc_x[i].tag_ = tags[i]
ref_dict["lemmas"] = lemmas doc_x[i].morph_ = morphs[i]
ref_dict["spaces"] = spaces doc_x[i].lemma_ = lemmas[i]
ref_dict["tags"] = tags doc_x[i].pos_ = poses[i]
ref_dict["morphs"] = morphs doc_x[i].dep_ = deps[i]
example = Example.from_dict(doc_x, ref_dict) doc_x[i].head = doc_x[heads[i]]
return example doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
doc_x.is_parsed = True
doc_x.is_tagged = True
return doc_x
def merge_conllu_subtokens(lines, doc): def merge_conllu_subtokens(lines, doc):
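
The rewritten converter no longer assembles JSON paragraphs; it buffers per-sentence `Doc`s and merges every `n_sents` of them with `Doc.from_docs`. Roughly, the grouping reduces to the following helper (a sketch; `group_sentences` is an illustrative name, not a function in the codebase):

```python
# Sketch of the grouping pattern: merge per-sentence Docs into one Doc per
# n_sents, plus a final partial batch.
from spacy.tokens import Doc

def group_sentences(sent_docs, n_sents=10):
    docs, buffer = [], []
    for sent_doc in sent_docs:
        buffer.append(sent_doc)
        if len(buffer) == n_sents:
            docs.append(Doc.from_docs(buffer))
            buffer = []
    if buffer:
        docs.append(Doc.from_docs(buffer))
    return docs
```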

View File

@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
for json_para in json_to_annotations(json_doc): for json_para in json_to_annotations(json_doc):
example_dict = _fix_legacy_dict_data(json_para) example_dict = _fix_legacy_dict_data(json_para)
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if json_para.get("raw"):
assert tok_dict.get("SPACY")
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
docs.append(doc) docs.append(doc)
return docs return docs

View File

@ -43,25 +43,36 @@ class Corpus:
locs.append(path) locs.append(path)
return locs return locs
def _make_example(self, nlp, reference, gold_preproc):
if gold_preproc or reference.has_unknown_spaces:
return Example(
Doc(
nlp.vocab,
words=[word.text for word in reference],
spaces=[bool(word.whitespace_) for word in reference]
),
reference
)
else:
return Example(
nlp.make_doc(reference.text),
reference
)
def make_examples(self, nlp, reference_docs, max_length=0): def make_examples(self, nlp, reference_docs, max_length=0):
for reference in reference_docs: for reference in reference_docs:
if len(reference) == 0: if len(reference) == 0:
continue continue
elif max_length == 0 or len(reference) < max_length: elif max_length == 0 or len(reference) < max_length:
yield Example( yield self._make_example(nlp, reference, False)
nlp.make_doc(reference.text),
reference
)
elif reference.is_sentenced: elif reference.is_sentenced:
for ref_sent in reference.sents: for ref_sent in reference.sents:
if len(ref_sent) == 0: if len(ref_sent) == 0:
continue continue
elif max_length == 0 or len(ref_sent) < max_length: elif max_length == 0 or len(ref_sent) < max_length:
yield Example( yield self._make_example(nlp, ref_sent.as_doc(), False)
nlp.make_doc(ref_sent.text),
ref_sent.as_doc()
)
def make_examples_gold_preproc(self, nlp, reference_docs): def make_examples_gold_preproc(self, nlp, reference_docs):
for reference in reference_docs: for reference in reference_docs:
if reference.is_sentenced: if reference.is_sentenced:
@ -69,14 +80,7 @@ class Corpus:
else: else:
ref_sents = [reference] ref_sents = [reference]
for ref_sent in ref_sents: for ref_sent in ref_sents:
eg = Example( eg = self._make_example(nlp, ref_sent, True)
Doc(
nlp.vocab,
words=[w.text for w in ref_sent],
spaces=[bool(w.whitespace_) for w in ref_sent]
),
ref_sent
)
if len(eg.x): if len(eg.x):
yield eg yield eg
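
The new `_make_example` helper decides how the predicted side of each `Example` is built: when gold preprocessing is requested or the reference `Doc` has unknown spaces, the tokens are copied directly instead of re-tokenizing `reference.text`. A small sketch of that case, assuming a blank English pipeline:

```python
# Sketch: a reference Doc created without spaces has has_unknown_spaces set,
# so the predicted Doc is built from its tokens rather than from raw text.
import spacy
from spacy.gold import Example
from spacy.tokens import Doc

nlp = spacy.blank("en")
reference = Doc(nlp.vocab, words=["that", "'s", "fine"])
assert reference.has_unknown_spaces
predicted = Doc(
    nlp.vocab,
    words=[w.text for w in reference],
    spaces=[bool(w.whitespace_) for w in reference],
)
example = Example(predicted, reference)
```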

View File

@ -15,7 +15,7 @@ from ..syntax import nonproj
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
""" Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ """ Create a Doc from dictionaries with token and doc annotations. """
attrs, array = _annot2array(vocab, tok_annot, doc_annot) attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
if "entities" in doc_annot: if "entities" in doc_annot:
@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key == "entities": if key == "entities":
pass pass
elif key == "links": elif key == "links":
entities = doc_annot.get("entities", {}) ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
if not entities:
raise ValueError(Errors.E981)
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
tok_annot["ENT_KB_ID"] = ent_kb_ids tok_annot["ENT_KB_ID"] = ent_kb_ids
elif key == "cats": elif key == "cats":
pass pass
@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("") ent_types.append("")
return ent_iobs, ent_types return ent_iobs, ent_types
def _parse_links(vocab, words, links, entities): def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words) reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference} starts = {token.idx: token.i for token in reference}
ends = {token.idx + len(token): token.i for token in reference} ends = {token.idx + len(token): token.i for token in reference}
ent_kb_ids = ["" for _ in reference] ent_kb_ids = ["" for _ in reference]
entity_map = [(ent[0], ent[1]) for ent in entities]
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
for index, annot_dict in links.items():
start_char, end_char = index
if (start_char, end_char) not in entity_map:
raise ValueError(Errors.E981)
for index, annot_dict in links.items(): for index, annot_dict in links.items():
true_kb_ids = [] true_kb_ids = []
@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities):
start_char, end_char = index start_char, end_char = index
start_token = starts.get(start_char) start_token = starts.get(start_char)
end_token = ends.get(end_char) end_token = ends.get(end_char)
if start_token is None or end_token is None:
raise ValueError(Errors.E981)
for i in range(start_token, end_token+1): for i in range(start_token, end_token+1):
ent_kb_ids[i] = true_kb_ids[0] ent_kb_ids[i] = true_kb_ids[0]
@ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities):
def _guess_spaces(text, words): def _guess_spaces(text, words):
if text is None: if text is None:
return [True] * len(words) return None
spaces = [] spaces = []
text_pos = 0 text_pos = 0
# align words with text # align words with text
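
With `_parse_links` now receiving the token spacing directly, link annotations only need character offsets that align to token boundaries; a parallel "entities" list is no longer required, and E981 is raised only when the offsets cannot be aligned. A sketch under these assumptions ("Q60" is an arbitrary knowledge-base identifier):

```python
# Sketch: provide entity links by character offsets alone; the offsets
# (7, 15) cover the tokens "New York" in the words below.
from spacy.gold import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["I", "like", "New", "York"]
predicted = Doc(vocab, words=words)
annotations = {
    "words": words,
    "links": {(7, 15): {"Q60": 1.0}},
}
example = Example.from_dict(predicted, annotations)
```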

View File

@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
assert new_doc.is_parsed assert new_doc.is_parsed
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "They don't think alike."]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
docs_idx = en_texts[0].index('docs')
de_doc = de_tokenizer(de_text)
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc]))
with pytest.raises(ValueError):
Doc.from_docs(en_docs + [de_doc])
m_doc = Doc.from_docs(en_docs)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts)
p_token = m_doc[len(en_docs[0])-1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
with pytest.raises(AttributeError):
not_available = m_doc[2]._.is_ambiguous # raises because the extension was not registered via set_extension
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == "".join(en_texts)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
with pytest.raises(ValueError): # important attributes from sentencizer or parser are missing
assert list(m_doc.sents)
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
def test_doc_lang(en_vocab): def test_doc_lang(en_vocab):
doc = Doc(en_vocab, words=["Hello", "world"]) doc = Doc(en_vocab, words=["Hello", "world"])
assert doc.lang_ == "en" assert doc.lang_ == "en"

View File

@ -75,3 +75,19 @@ def test_serialize_doc_bin():
for i, doc in enumerate(reloaded_docs): for i, doc in enumerate(reloaded_docs):
assert doc.text == texts[i] assert doc.text == texts[i]
assert doc.cats == cats assert doc.cats == cats
def test_serialize_doc_bin_unknown_spaces(en_vocab):
doc1 = Doc(en_vocab, words=["that", "'s"])
assert doc1.has_unknown_spaces
assert doc1.text == "that 's "
doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
assert not doc2.has_unknown_spaces
assert doc2.text == "that's"
doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
assert re_doc1.has_unknown_spaces
assert re_doc1.text == "that 's "
assert not re_doc2.has_unknown_spaces
assert re_doc2.text == "that's"

View File

@ -1,14 +1,10 @@
import pytest import pytest
from spacy.gold import docs_to_json from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.gold.converters.conllu2json import conllu2json
from spacy.lang.en import English from spacy.lang.en import English
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
# TODO
# from spacy.gold.converters import conllu2docs
def test_cli_converters_conllu2json(): def test_cli_converters_conllu2json():
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
] ]
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1) converted_docs = conllu2docs(input_data, n_sents=1)
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["head"] for t in tokens] == [1, 2, -1, 0]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
) )
def test_cli_converters_conllu2json_name_ner_map(lines): def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
sent = converted[0]["paragraphs"][0]["sentences"][0] sent = converted[0]["paragraphs"][0]["sentences"][0]
assert len(sent["tokens"]) == 5 assert len(sent["tokens"]) == 5
@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu2json_subtokens(): def test_cli_converters_conllu2json_subtokens():
@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
] ]
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json( converted_docs = conllu2docs(
input_data, n_sents=1, merge_subtokens=True, append_morphology=True input_data, n_sents=1, merge_subtokens=True, append_morphology=True
) )
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
sent = converted[0]["paragraphs"][0]["sentences"][0] sent = converted[0]["paragraphs"][0]["sentences"][0]
assert len(sent["tokens"]) == 4 assert len(sent["tokens"]) == 4
@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["head"] for t in tokens] == [1, 1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "U-PER", "O", "O"]
def test_cli_converters_iob2json(en_vocab): def test_cli_converters_iob2json(en_vocab):

View File

@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
[ [
{ {
"words": ["I", "like", "New", "York", "and", "Berlin", "."], "words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")], "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
} }
], ],
) )

View File

@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors from ..errors import Errors
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
class DocBin(object): class DocBin(object):
@ -31,6 +31,7 @@ class DocBin(object):
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths "lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data "strings": List[unicode] # List of unique strings in the token data
"version": str, # DocBin version number
} }
Strings for the words, tags, labels etc are represented by 64-bit hashes in Strings for the words, tags, labels etc are represented by 64-bit hashes in
@ -53,12 +54,14 @@ class DocBin(object):
DOCS: https://spacy.io/api/docbin#init DOCS: https://spacy.io/api/docbin#init
""" """
attrs = sorted([intify_attr(attr) for attr in attrs]) attrs = sorted([intify_attr(attr) for attr in attrs])
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = [] self.tokens = []
self.spaces = [] self.spaces = []
self.cats = [] self.cats = []
self.user_data = [] self.user_data = []
self.flags = []
self.strings = set() self.strings = set()
self.store_user_data = store_user_data self.store_user_data = store_user_data
for doc in docs: for doc in docs:
@ -83,12 +86,17 @@ class DocBin(object):
assert array.shape[0] == spaces.shape[0] # this should never happen assert array.shape[0] == spaces.shape[0] # this should never happen
spaces = spaces.reshape((spaces.shape[0], 1)) spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.flags.append({
"has_unknown_spaces": doc.has_unknown_spaces
})
for token in doc: for token in doc:
self.strings.add(token.text) self.strings.add(token.text)
self.strings.add(token.tag_) self.strings.add(token.tag_)
self.strings.add(token.lemma_) self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(token.dep_) self.strings.add(token.dep_)
self.strings.add(token.ent_type_) self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)
self.cats.append(doc.cats) self.cats.append(doc.cats)
if self.store_user_data: if self.store_user_data:
self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@ -105,8 +113,11 @@ class DocBin(object):
vocab[string] vocab[string]
orth_col = self.attrs.index(ORTH) orth_col = self.attrs.index(ORTH)
for i in range(len(self.tokens)): for i in range(len(self.tokens)):
flags = self.flags[i]
tokens = self.tokens[i] tokens = self.tokens[i]
spaces = self.spaces[i] spaces = self.spaces[i]
if flags.get("has_unknown_spaces"):
spaces = None
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
doc = doc.from_array(self.attrs, tokens) doc = doc.from_array(self.attrs, tokens)
doc.cats = self.cats[i] doc.cats = self.cats[i]
@ -130,6 +141,7 @@ class DocBin(object):
self.spaces.extend(other.spaces) self.spaces.extend(other.spaces)
self.strings.update(other.strings) self.strings.update(other.strings)
self.cats.extend(other.cats) self.cats.extend(other.cats)
self.flags.extend(other.flags)
if self.store_user_data: if self.store_user_data:
self.user_data.extend(other.user_data) self.user_data.extend(other.user_data)
@ -147,12 +159,14 @@ class DocBin(object):
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = { msg = {
"version": self.version,
"attrs": self.attrs, "attrs": self.attrs,
"tokens": tokens.tobytes("C"), "tokens": tokens.tobytes("C"),
"spaces": spaces.tobytes("C"), "spaces": spaces.tobytes("C"),
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings), "strings": list(self.strings),
"cats": self.cats, "cats": self.cats,
"flags": self.flags,
} }
if self.store_user_data: if self.store_user_data:
msg["user_data"] = self.user_data msg["user_data"] = self.user_data
@ -178,6 +192,7 @@ class DocBin(object):
self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
self.spaces = NumpyOps().unflatten(flat_spaces, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
self.cats = msg["cats"] self.cats = msg["cats"]
self.flags = msg.get("flags", [{} for _ in lengths])
if self.store_user_data and "user_data" in msg: if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"]) self.user_data = list(msg["user_data"])
for tokens in self.tokens: for tokens in self.tokens:

View File

@ -59,11 +59,14 @@ cdef class Doc:
cdef public dict user_token_hooks cdef public dict user_token_hooks
cdef public dict user_span_hooks cdef public dict user_span_hooks
cdef public bint has_unknown_spaces
cdef public list _py_tokens cdef public list _py_tokens
cdef int length cdef int length
cdef int max_length cdef int max_length
cdef public object noun_chunks_iterator cdef public object noun_chunks_iterator
cdef object __weakref__ cdef object __weakref__

View File

@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter from collections import Counter
import numpy import numpy
import numpy.linalg import numpy.linalg
@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS from ..attrs import intify_attr, intify_attrs, IDS
from ..util import normalize_slice from ..util import normalize_slice
from ..compat import copy_reg, pickle from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
@ -171,8 +172,7 @@ cdef class Doc:
raise ValueError(Errors.E046.format(name=name)) raise ValueError(Errors.E046.format(name=name))
return Underscore.doc_extensions.pop(name) return Underscore.doc_extensions.pop(name)
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
orths_and_spaces=None):
"""Create a Doc object. """Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you vocab (Vocab): A vocabulary object, which must match any models you
@ -214,28 +214,25 @@ cdef class Doc:
self._vector = None self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang) self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
cdef bint has_space cdef bint has_space
if orths_and_spaces is None and words is not None: if words is None and spaces is not None:
if spaces is None: raise ValueError("words must be set if spaces is set")
spaces = [True] * len(words) elif spaces is None and words is not None:
elif len(spaces) != len(words): self.has_unknown_spaces = True
raise ValueError(Errors.E027) else:
orths_and_spaces = zip(words, spaces) self.has_unknown_spaces = False
words = words if words is not None else []
spaces = spaces if spaces is not None else ([True] * len(words))
if len(spaces) != len(words):
raise ValueError(Errors.E027)
cdef const LexemeC* lexeme cdef const LexemeC* lexeme
if orths_and_spaces is not None: for word, has_space in zip(words, spaces):
orths_and_spaces = list(orths_and_spaces) if isinstance(word, unicode):
for orth_space in orths_and_spaces: lexeme = self.vocab.get(self.mem, word)
if isinstance(orth_space, unicode): elif isinstance(word, bytes):
lexeme = self.vocab.get(self.mem, orth_space) raise ValueError(Errors.E028.format(value=word))
has_space = True else:
elif isinstance(orth_space, bytes): lexeme = self.vocab.get_by_orth(self.mem, word)
raise ValueError(Errors.E028.format(value=orth_space)) self.push_back(lexeme, has_space)
elif isinstance(orth_space[0], unicode):
lexeme = self.vocab.get(self.mem, orth_space[0])
has_space = orth_space[1]
else:
lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
has_space = orth_space[1]
self.push_back(lexeme, has_space)
# Tough to decide on policy for this. Is an empty doc tagged and parsed? # Tough to decide on policy for this. Is an empty doc tagged and parsed?
# There's no information we'd like to add to it, so I guess so? # There's no information we'd like to add to it, so I guess so?
if self.length == 0: if self.length == 0:
@ -806,7 +803,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs] for id_ in attrs]
if array.dtype != numpy.uint64: if array.dtype != numpy.uint64:
warnings.warn(Warnings.W028.format(type=array.dtype)) warnings.warn(Warnings.W101.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs: if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032) raise ValueError(Errors.E032)
@ -882,6 +879,87 @@ cdef class Doc:
set_children_from_heads(self.c, length) set_children_from_heads(self.c, length)
return self return self
@staticmethod
def from_docs(docs, ensure_whitespace=True, attrs=None):
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
the same `Vocab`.
docs (list): A list of Doc objects.
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
attrs (list): Optional list of attribute ID ints or attribute name strings.
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
DOCS: https://spacy.io/api/doc#from_docs
"""
if not docs:
return None
vocab = {doc.vocab for doc in docs}
if len(vocab) > 1:
raise ValueError(Errors.E999)
(vocab,) = vocab
if attrs is None:
attrs = [LEMMA, NORM]
if all(doc.is_nered for doc in docs):
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
# TODO: separate for is_morphed?
if all(doc.is_tagged for doc in docs):
attrs.extend([TAG, POS, MORPH])
if all(doc.is_parsed for doc in docs):
attrs.extend([HEAD, DEP])
else:
attrs.append(SENT_START)
else:
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present
if SPACY not in attrs:
attrs.append(SPACY)
concat_words = []
concat_spaces = []
concat_user_data = {}
char_offset = 0
for doc in docs:
concat_words.extend(t.text for t in doc)
concat_spaces.extend(bool(t.whitespace_) for t in doc)
for key, value in doc.user_data.items():
if isinstance(key, tuple) and len(key) == 4:
data_type, name, start, end = key
if start is not None or end is not None:
start += char_offset
if end is not None:
end += char_offset
concat_user_data[(data_type, name, start, end)] = copy.copy(value)
else:
warnings.warn(Warnings.W101.format(name=name))
else:
warnings.warn(Warnings.W102.format(key=key, value=value))
char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
arrays = [doc.to_array(attrs) for doc in docs]
if ensure_whitespace:
spacy_index = attrs.index(SPACY)
for i, array in enumerate(arrays[:-1]):
if len(array) > 0 and not docs[i][-1].is_space:
array[-1][spacy_index] = 1
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
concat_doc.from_array(attrs, concat_array)
return concat_doc
def get_lca_matrix(self): def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Doc`, where LCA[i, j] is the index of the lowest common ancestor among `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
@ -1000,6 +1078,7 @@ cdef class Doc:
"sentiment": lambda: self.sentiment, "sentiment": lambda: self.sentiment,
"tensor": lambda: self.tensor, "tensor": lambda: self.tensor,
"cats": lambda: self.cats, "cats": lambda: self.cats,
"has_unknown_spaces": lambda: self.has_unknown_spaces
} }
for key in kwargs: for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@ -1032,6 +1111,7 @@ cdef class Doc:
"cats": lambda b: None, "cats": lambda b: None,
"user_data_keys": lambda b: None, "user_data_keys": lambda b: None,
"user_data_values": lambda b: None, "user_data_values": lambda b: None,
"has_unknown_spaces": lambda b: None
} }
for key in kwargs: for key in kwargs:
if key in deserializers or key in ("user_data",): if key in deserializers or key in ("user_data",):
@ -1052,6 +1132,8 @@ cdef class Doc:
self.tensor = msg["tensor"] self.tensor = msg["tensor"]
if "cats" not in exclude and "cats" in msg: if "cats" not in exclude and "cats" in msg:
self.cats = msg["cats"] self.cats = msg["cats"]
if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
self.has_unknown_spaces = msg["has_unknown_spaces"]
start = 0 start = 0
cdef const LexemeC* lex cdef const LexemeC* lex
cdef unicode orth_ cdef unicode orth_
@ -1123,50 +1205,6 @@ cdef class Doc:
remove_label_if_necessary(attributes[i]) remove_label_if_necessary(attributes[i])
retokenizer.merge(span, attributes[i]) retokenizer.merge(span, attributes[i])
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx `do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
# TODO: ENT_KB_ID ?
if len(args) == 3:
warnings.warn(Warnings.W003, DeprecationWarning)
tag, lemma, ent_type = args
attributes[TAG] = tag
attributes[LEMMA] = lemma
attributes[ENT_TYPE] = ent_type
elif not args:
fix_attributes(self, attributes)
elif args:
raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
kwargs=repr(attributes)))
remove_label_if_necessary(attributes)
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
with self.retokenize() as retokenizer:
retokenizer.merge(self[start:end], attrs=attributes)
return self[start]
def print_tree(self, light=False, flat=False):
raise ValueError(Errors.E105)
def to_json(self, underscore=None): def to_json(self, underscore=None):
"""Convert a Doc to JSON. The format it produces will be the new format """Convert a Doc to JSON. The format it produces will be the new format
for the `spacy train` command (not implemented yet). for the `spacy train` command (not implemented yet).

View File

@ -280,18 +280,6 @@ cdef class Span:
return array return array
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single
token.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def get_lca_matrix(self): def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Span`, where LCA[i, j] is the index of the lowest common ancestor among `Span`, where LCA[i, j] is the index of the lowest common ancestor among
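
Both deprecated merge methods removed in this commit (`Doc.merge` and `Span.merge`) map onto the retokenizer context manager. A minimal sketch of the equivalent call, assuming a blank English pipeline:

```python
# Sketch: merge a span in place with the retokenizer context manager,
# which replaces the removed Doc.merge / Span.merge helpers.
import spacy

nlp = spacy.blank("en")
doc = nlp("Hello New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:3], attrs={"LEMMA": "New York"})
assert len(doc) == 2
```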

View File

@ -349,6 +349,33 @@ array of attributes.
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. | | **RETURNS** | `Doc` | Itself. |
## Doc.from_docs {#from_docs tag="staticmethod"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
> #### Example
>
> ```python
> from spacy.tokens import Doc
> texts = ["London is the capital of the United Kingdom.",
> "The River Thames flows through London.",
> "The famous Tower Bridge crosses the River Thames."]
> docs = list(nlp.pipe(texts))
> c_doc = Doc.from_docs(docs)
> assert str(c_doc) == " ".join(texts)
> assert len(list(c_doc.sents)) == len(docs)
> assert [str(ent) for ent in c_doc.ents] == \
> [str(ent) for doc in docs for ent in doc.ents]
> ```
| Name | Type | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
## Doc.to_disk {#to_disk tag="method" new="2"} ## Doc.to_disk {#to_disk tag="method" new="2"}
Save the current state to a directory. Save the current state to a directory.

View File

@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure: the msgpack object has the following structure:
```python ```python
### msgpack object strcutrue ### msgpack object structure
{ {
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data "tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Type | Description | | Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. | | **RETURNS** | `DocBin` | The newly constructed object. |

View File

@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
## Matcher.\_\_call\_\_ {#call tag="method"} ## Matcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc`. As of Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
spaCy v2.3, the `Matcher` can also be called on `Span` objects.
> #### Example > #### Example
> >
@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
> >
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern) > matcher.add("HelloWorld", [pattern])
> doc = nlp("hello world!") > doc = nlp("hello world!")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | | `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
<Infobox title="Important note" variant="warning">
By default, the matcher **does not perform any action** on matches, like tagging
matched phrases with entity types. Instead, actions need to be specified when
**adding patterns or entities**, by passing in a callback function as the
`on_match` argument on [`add`](/api/matcher#add). This allows you to define
custom actions per pattern within the same matcher. For example, you might only
want to merge some entity types, and set custom flags for other matched
patterns. For more details and examples, see the usage guide on
[rule-based matching](/usage/rule-based-matching).
</Infobox>
## Matcher.pipe {#pipe tag="method"} ## Matcher.pipe {#pipe tag="method"}
Match a stream of documents, yielding them in turn. Match a stream of documents, yielding them in turn.
@ -92,7 +78,7 @@ patterns.
> ```python > ```python
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> assert len(matcher) == 0 > assert len(matcher) == 0
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
> >
> ```python > ```python
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> assert 'Rule' not in matcher > assert "Rule" not in matcher
> matcher.add('Rule', None, [{'ORTH': 'test'}]) > matcher.add("Rule", [[{'ORTH': 'test'}]])
> assert 'Rule' in matcher > assert "Rule" in matcher
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -129,39 +115,39 @@ overwritten.
> #### Example > #### Example
> >
> ```python > ```python
> def on_match(matcher, doc, id, matches): > def on_match(matcher, doc, id, matches):
> print('Matched!', matches) > print('Matched!', matches)
> >
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) > patterns = [
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) > [{"LOWER": "hello"}, {"LOWER": "world"}],
> doc = nlp("HELLO WORLD on Google Maps.") > [{"ORTH": "Google"}, {"ORTH": "Maps"}]
> matches = matcher(doc) > ]
> matcher.add("TEST_PATTERNS", patterns)
> doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ``` > ```
| Name | Type | Description | <Infobox title="Changed in v3.0" variant="warning">
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
<Infobox title="Changed in v2.2.2" variant="warning"> As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
the default in the future. The patterns are now the second argument and a list
(instead of a variable number of arguments). The `on_match` callback becomes an (instead of a variable number of arguments). The `on_match` callback becomes an
optional keyword argument. optional keyword argument.
```diff ```diff
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
- matcher.add("GoogleNow", None, *patterns)
+ matcher.add("GoogleNow", patterns)
- matcher.add("GoogleNow", on_match, *patterns) - matcher.add("GoogleNow", on_match, *patterns)
+ matcher.add("GoogleNow", patterns, on_match=on_match) + matcher.add("GoogleNow", patterns, on_match=on_match)
``` ```
</Infobox> </Infobox>
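
Spelled out in full rather than as a diff, and with the setup around it supplied by us, the v3-style call looks like this:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    print("Matched!", matches[i])

# Patterns are passed as a list; the callback is an optional keyword argument
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
matcher.add("GoogleNow", patterns, on_match=on_match)

doc = nlp("GoogleNow was later rebranded as Google Now.")
matches = matcher(doc)
```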
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## Matcher.remove {#remove tag="method" new="2"} ## Matcher.remove {#remove tag="method" new="2"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@ -170,7 +156,7 @@ exist.
> #### Example > #### Example
> >
> ```python > ```python
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> assert "Rule" in matcher > assert "Rule" in matcher
> matcher.remove("Rule") > matcher.remove("Rule")
> assert "Rule" not in matcher > assert "Rule" not in matcher
@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> #### Example > #### Example
> >
> ```python > ```python
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> on_match, patterns = matcher.get("Rule") > on_match, patterns = matcher.get("Rule")
> ``` > ```
View File
@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher > from spacy.matcher import PhraseMatcher
> >
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> doc = nlp("Barack Obama lifts America one last time in emotional farewell") > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
@ -104,7 +104,7 @@ patterns.
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0 > assert len(matcher) == 0
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher > assert "OBAMA" not in matcher
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher > assert "OBAMA" in matcher
> ``` > ```
@ -145,36 +145,32 @@ overwritten.
> print('Matched!', matches) > print('Matched!', matches)
> >
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", on_match, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
> matcher.add("HEALTH", on_match, nlp("health care reform"), > matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
> nlp("healthcare reform"))
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | <Infobox title="Changed in v3.0" variant="warning">
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
<Infobox title="Changed in v2.2.2" variant="warning"> As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
argument (instead of a variable number of arguments). The `on_match` callback
As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
become the default in the future. The `Doc` patterns are now the second argument
and a list (instead of a variable number of arguments). The `on_match` callback
becomes an optional keyword argument. becomes an optional keyword argument.
```diff ```diff
patterns = [nlp("health care reform"), nlp("healthcare reform")] patterns = [nlp("health care reform"), nlp("healthcare reform")]
- matcher.add("HEALTH", None, *patterns)
+ matcher.add("HEALTH", patterns)
- matcher.add("HEALTH", on_match, *patterns) - matcher.add("HEALTH", on_match, *patterns)
+ matcher.add("HEALTH", patterns, on_match=on_match) + matcher.add("HEALTH", patterns, on_match=on_match)
``` ```
</Infobox> </Infobox>
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `docs` | list | `Doc` objects of the phrases to match. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
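
A compact sketch of the v3-style call (the `HEALTH` rule and the example sentence mirror the diff above but are otherwise ours):

```python
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

# Doc patterns are passed as a list; the callback is an optional keyword argument
patterns = [nlp.make_doc("health care reform"), nlp.make_doc("healthcare reform")]
matcher.add("HEALTH", patterns, on_match=on_match)

doc = nlp("He urged Congress to pass healthcare reform this year.")
matches = matcher(doc)
```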
## PhraseMatcher.remove {#remove tag="method" new="2.2"} ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@ -184,7 +180,7 @@ does not exist.
> >
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher > assert "OBAMA" in matcher
> matcher.remove("OBAMA") > matcher.remove("OBAMA")
> assert "OBAMA" not in matcher > assert "OBAMA" not in matcher
View File
@ -407,7 +407,7 @@ class EntityMatcher(object):
def __init__(self, nlp, terms, label): def __init__(self, nlp, terms, label):
patterns = [nlp.make_doc(text) for text in terms] patterns = [nlp.make_doc(text) for text in terms]
self.matcher = PhraseMatcher(nlp.vocab) self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add(label, None, *patterns) self.matcher.add(label, patterns)
def __call__(self, doc): def __call__(self, doc):
matches = self.matcher(doc) matches = self.matcher(doc)
View File
@ -98,9 +98,7 @@ print([token.text for token in doc])
First, we initialize the `Matcher` with a vocab. The matcher must always share First, we initialize the `Matcher` with a vocab. The matcher must always share
the same vocab with the documents it will operate on. We can now call the same vocab with the documents it will operate on. We can now call
[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The [`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
second argument lets you pass in an optional callback function to invoke on a
successful match. For now, we set it to `None`.
```python ```python
### {executable="true"} ### {executable="true"}
@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern # Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern) matcher.add("HelloWorld", [pattern])
doc = nlp("Hello, world! Hello world!") doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) matches = matcher(doc)
@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
also match sequences without punctuation between "hello" and "world": also match sequences without punctuation between "hello" and "world":
```python ```python
matcher.add("HelloWorld", None, patterns = [
[{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
[{"LOWER": "hello"}, {"LOWER": "world"}]) [{"LOWER": "hello"}, {"LOWER": "world"}]
]
matcher.add("HelloWorld", patterns)
``` ```
By default, the matcher will only return the matches and **not do anything By default, the matcher will only return the matches and **not do anything
@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True) matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
matcher.add("HelloWorld", None, pattern) matcher.add("HelloWorld", [pattern])
# 🚨 Raises an error: # 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0: # Pattern 0:
@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
print(entity.text) print(entity.text)
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern) matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O") doc = nlp("This is a text about Google I/O")
matches = matcher(doc) matches = matcher(doc)
``` ```
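
The hunk above only shows the tail of `add_event_ent`; as a hypothetical reconstruction (not this page's original code), a callback that turns the match into an `EVENT` entity could look like:

```python
from spacy.tokens import Span

def add_event_ent(matcher, doc, i, matches):
    # Get the matched span and append it to the doc's entities
    # (assumes the new span doesn't overlap an existing entity)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents = list(doc.ents) + [entity]
    print(entity.text)
```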
@ -509,19 +509,18 @@ import spacy
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Token from spacy.tokens import Token
# We're using a class because the component needs to be initialised with # We're using a class because the component needs to be initialized with
# the shared vocab via the nlp object # the shared vocab via the nlp object
class BadHTMLMerger(object): class BadHTMLMerger(object):
def __init__(self, nlp): def __init__(self, nlp):
patterns = [
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
]
# Register a new token extension to flag bad HTML # Register a new token extension to flag bad HTML
Token.set_extension("bad_html", default=False) Token.set_extension("bad_html", default=False)
self.matcher = Matcher(nlp.vocab) self.matcher = Matcher(nlp.vocab)
self.matcher.add( self.matcher.add("BAD_HTML", patterns)
"BAD_HTML",
None,
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
)
def __call__(self, doc): def __call__(self, doc):
# This method is invoked when the component is called on a Doc # This method is invoked when the component is called on a Doc
@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}] {"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern
doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?") doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?")
matches = matcher(doc) matches = matcher(doc)
@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern) matcher.add("PHONE_NUMBER", [pattern])
doc = nlp("Call me at (123) 456 789 or (123) 456 789!") doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc]) print([t.text for t in doc])
@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
elif doc.vocab.strings[match_id] == "SAD": elif doc.vocab.strings[match_id] == "SAD":
doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment
matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern
matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
doc = nlp("Hello world 😀 #MondayMotivation") doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc) matches = matcher(doc)
@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up # Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms] patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns) matcher.add("TerminologyList", patterns)
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
"converse in the Oval Office inside the White House in Washington, D.C.") "converse in the Oval Office inside the White House in Washington, D.C.")
@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER") matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("Names", None, *patterns) matcher.add("Names", patterns)
doc = nlp("angela merkel and us president barack Obama") doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
View File
@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
def set_sentiment(matcher, doc, i, matches): def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1 doc.sentiment += 0.1
pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji
doc = nlp("A text about Google I/O 😀😀") doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc) matches = matcher(doc)