diff --git a/spacy/about.py b/spacy/about.py index 818dd1286..5b2a89c61 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a0" +__version__ = "3.0.0a1" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index b008e2f93..56f38766a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -9,7 +9,7 @@ import sys from ._app import app, Arg, Opt from ..gold import docs_to_json from ..tokens import DocBin -from ..gold.converters import iob2docs, conll_ner2docs, json2docs +from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs # Converters are matched by file extension except for ner/iob, which are @@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs # imported from /converters. CONVERTERS = { - # "conllubio": conllu2docs, TODO - # "conllu": conllu2docs, TODO - # "conll": conllu2docs, TODO + "conllubio": conllu2docs, + "conllu": conllu2docs, + "conll": conllu2docs, "ner": conll_ner2docs, "iob": iob2docs, "json": json2docs, @@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type): if output_type == "json": srsly.write_json("-", docs_to_json(docs)) else: - sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) + sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) def _write_docs_to_file(docs, output_file, output_type): @@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type): if output_type == "json": srsly.write_json(output_file, docs_to_json(docs)) else: - data = DocBin(docs=docs).to_bytes() + data = DocBin(docs=docs, store_user_data=True).to_bytes() with output_file.open("wb") as file_: file_.write(data) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 37f862ef2..d0d876aed 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -37,7 +37,7 @@ def init_model_cli( clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), - prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), + prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. 
en_core_web_lg.vectors"), model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), @@ -56,6 +56,7 @@ def init_model_cli( freqs_loc=freqs_loc, clusters_loc=clusters_loc, jsonl_loc=jsonl_loc, + vectors_loc=vectors_loc, prune_vectors=prune_vectors, truncate_vectors=truncate_vectors, vectors_name=vectors_name, @@ -228,7 +229,7 @@ def add_vectors( else: if vectors_loc: with msg.loading(f"Reading vectors from {vectors_loc}"): - vectors_data, vector_keys = read_vectors(msg, vectors_loc) + vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors) msg.good(f"Loaded vectors from {vectors_loc}") else: vectors_data, vector_keys = (None, None) @@ -247,7 +248,7 @@ def add_vectors( nlp.vocab.prune_vectors(prune_vectors) -def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): +def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 18c429c60..d0684dcff 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model from ..tokens import Doc from ..attrs import ID, HEAD from .. import util -from ..gold import Example @app.command("pretrain") @@ -183,7 +182,7 @@ def pretrain( for batch_id, batch in enumerate(batches): docs, count = make_docs( nlp, - [ex.doc for ex in batch], + batch, max_length=pretrain_config["max_length"], min_length=pretrain_config["min_length"], ) diff --git a/spacy/errors.py b/spacy/errors.py index 66a3c61da..07cf7bbdf 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -159,6 +159,8 @@ class Warnings(object): W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "string \"Field1=Value1,Value2|Field2=Value3\".") + W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.") + W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.") @add_codes @@ -556,8 +558,8 @@ class Errors(object): E979 = ("Cannot convert {type} to an Example object.") E980 = ("Each link annotation should refer to a dictionary with at most one " "identifier mapping to 1.0, and all others to 0.0.") - E981 = ("The offsets of the annotations for 'links' need to refer exactly " - "to the offsets of the 'entities' annotations.") + E981 = ("The offsets of the annotations for 'links' could not be aligned " + "to token boundaries.") E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " "into {values}, but found {value}.") E983 = ("Invalid key for '{dict}': {key}. Available keys: " @@ -593,7 +595,9 @@ class Errors(object): E997 = ("Tokenizer special cases are not allowed to modify the text. 
" "This would map '{chunk}' to '{orth}' given token attributes " "'{token_attrs}'.") - + E999 = ("Unable to merge the `Doc` objects because they do not all share " + "the same `Vocab`.") + @add_codes class TempErrors(object): diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py index 3e366933a..63d52ad9d 100644 --- a/spacy/gold/converters/__init__.py +++ b/spacy/gold/converters/__init__.py @@ -1,6 +1,4 @@ from .iob2docs import iob2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401 from .json2docs import json2docs - -# TODO: Update this one -# from .conllu2docs import conllu2docs # noqa: F401 +from .conllu2docs import conllu2docs # noqa: F401 diff --git a/spacy/gold/converters/conllu2json.py b/spacy/gold/converters/conllu2docs.py similarity index 79% rename from spacy/gold/converters/conllu2json.py rename to spacy/gold/converters/conllu2docs.py index 73fdf57e7..b591d3218 100644 --- a/spacy/gold/converters/conllu2json.py +++ b/spacy/gold/converters/conllu2docs.py @@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info from ...gold import Example from ...gold import iob_to_biluo, spans_from_biluo_tags from ...language import Language -from ...tokens import Doc, Token +from ...tokens import Doc, Token, Span from wasabi import Printer -def conllu2json( +def conllu2docs( input_data, n_sents=10, append_morphology=False, @@ -28,34 +28,22 @@ def conllu2json( MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" msg = Printer(no_print=no_print) n_sents_info(msg, n_sents) - docs = [] - raw = "" - sentences = [] - conll_data = read_conllx( + sent_docs = read_conllx( input_data, append_morphology=append_morphology, ner_tag_pattern=MISC_NER_PATTERN, ner_map=ner_map, merge_subtokens=merge_subtokens, ) - has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) - for i, example in enumerate(conll_data): - raw += example.text - sentences.append( - generate_sentence( - example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, - ) - ) - # Real-sized documents could be extracted using the comments on the - # conllu document - if len(sentences) % n_sents == 0: - doc = create_json_doc(raw, sentences, i) - docs.append(doc) - raw = "" - sentences = [] - if sentences: - doc = create_json_doc(raw, sentences, i) - docs.append(doc) + docs = [] + sent_docs_to_merge = [] + for sent_doc in sent_docs: + sent_docs_to_merge.append(sent_doc) + if len(sent_docs_to_merge) % n_sents == 0: + docs.append(Doc.from_docs(sent_docs_to_merge)) + sent_docs_to_merge = [] + if sent_docs_to_merge: + docs.append(Doc.from_docs(sent_docs_to_merge)) return docs @@ -84,14 +72,14 @@ def read_conllx( ner_tag_pattern="", ner_map=None, ): - """ Yield examples, one for each sentence """ + """ Yield docs, one for each sentence """ vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc for sent in input_data.strip().split("\n\n"): lines = sent.strip().split("\n") if lines: while lines[0].startswith("#"): lines.pop(0) - example = example_from_conllu_sentence( + doc = doc_from_conllu_sentence( vocab, lines, ner_tag_pattern, @@ -99,7 +87,7 @@ def read_conllx( append_morphology=append_morphology, ner_map=ner_map, ) - yield example + yield doc def get_entities(lines, tag_pattern, ner_map=None): @@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None): return iob_to_biluo(iob) -def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None): - sentence = {} - tokens = [] - token_annotation = example_dict["token_annotation"] - for i, id_ in 
enumerate(token_annotation["ids"]): - token = {} - token["id"] = id_ - token["orth"] = token_annotation["words"][i] - token["tag"] = token_annotation["tags"][i] - token["pos"] = token_annotation["pos"][i] - token["lemma"] = token_annotation["lemmas"][i] - token["morph"] = token_annotation["morphs"][i] - token["head"] = token_annotation["heads"][i] - i - token["dep"] = token_annotation["deps"][i] - if has_ner_tags: - token["ner"] = example_dict["doc_annotation"]["entities"][i] - tokens.append(token) - sentence["tokens"] = tokens - return sentence - - -def create_json_doc(raw, sentences, id_): - doc = {} - paragraph = {} - doc["id"] = id_ - doc["paragraphs"] = [] - paragraph["raw"] = raw.strip() - paragraph["sentences"] = sentences - doc["paragraphs"].append(paragraph) - return doc - - -def example_from_conllu_sentence( +def doc_from_conllu_sentence( vocab, lines, ner_tag_pattern, @@ -263,8 +219,9 @@ def example_from_conllu_sentence( if merge_subtokens: doc = merge_conllu_subtokens(lines, doc) - # create Example from custom Doc annotation - words, spaces, tags, morphs, lemmas = [], [], [], [], [] + # create final Doc from custom Doc annotation + words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], [] + heads, deps = [], [] for i, t in enumerate(doc): words.append(t._.merged_orth) lemmas.append(t._.merged_lemma) @@ -274,16 +231,23 @@ def example_from_conllu_sentence( tags.append(t.tag_ + "__" + t._.merged_morph) else: tags.append(t.tag_) + poses.append(t.pos_) + heads.append(t.head.i) + deps.append(t.dep_) doc_x = Doc(vocab, words=words, spaces=spaces) - ref_dict = Example(doc_x, reference=doc).to_dict() - ref_dict["words"] = words - ref_dict["lemmas"] = lemmas - ref_dict["spaces"] = spaces - ref_dict["tags"] = tags - ref_dict["morphs"] = morphs - example = Example.from_dict(doc_x, ref_dict) - return example + for i in range(len(doc)): + doc_x[i].tag_ = tags[i] + doc_x[i].morph_ = morphs[i] + doc_x[i].lemma_ = lemmas[i] + doc_x[i].pos_ = poses[i] + doc_x[i].dep_ = deps[i] + doc_x[i].head = doc_x[heads[i]] + doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] + doc_x.is_parsed = True + doc_x.is_tagged = True + + return doc_x def merge_conllu_subtokens(lines, doc): diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py index 50ad16faf..342f94848 100644 --- a/spacy/gold/converters/json2docs.py +++ b/spacy/gold/converters/json2docs.py @@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs): for json_para in json_to_annotations(json_doc): example_dict = _fix_legacy_dict_data(json_para) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - if json_para.get("raw"): - assert tok_dict.get("SPACY") doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9a688987c..64f38d21c 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -43,25 +43,36 @@ class Corpus: locs.append(path) return locs + def _make_example(self, nlp, reference, gold_preproc): + if gold_preproc or reference.has_unknown_spaces: + return Example( + Doc( + nlp.vocab, + words=[word.text for word in reference], + spaces=[bool(word.whitespace_) for word in reference] + ), + reference + ) + else: + return Example( + nlp.make_doc(reference.text), + reference + ) + def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: if len(reference) == 0: continue elif max_length == 0 or len(reference) < max_length: - yield Example( 
- nlp.make_doc(reference.text), - reference - ) + yield self._make_example(nlp, reference, False) elif reference.is_sentenced: for ref_sent in reference.sents: if len(ref_sent) == 0: continue elif max_length == 0 or len(ref_sent) < max_length: - yield Example( - nlp.make_doc(ref_sent.text), - ref_sent.as_doc() - ) + yield self._make_example(nlp, ref_sent.as_doc(), False) + def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: if reference.is_sentenced: @@ -69,14 +80,7 @@ class Corpus: else: ref_sents = [reference] for ref_sent in ref_sents: - eg = Example( - Doc( - nlp.vocab, - words=[w.text for w in ref_sent], - spaces=[bool(w.whitespace_) for w in ref_sent] - ), - ref_sent - ) + eg = self._make_example(nlp, ref_sent, True) if len(eg.x): yield eg diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 5e36156a9..7b629dcd2 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -15,7 +15,7 @@ from ..syntax import nonproj cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): - """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ + """ Create a Doc from dictionaries with token and doc annotations. """ attrs, array = _annot2array(vocab, tok_annot, doc_annot) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if "entities" in doc_annot: @@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot): if key == "entities": pass elif key == "links": - entities = doc_annot.get("entities", {}) - if not entities: - raise ValueError(Errors.E981) - ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) + ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value) tok_annot["ENT_KB_ID"] = ent_kb_ids elif key == "cats": pass @@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): ent_types.append("") return ent_iobs, ent_types -def _parse_links(vocab, words, links, entities): - reference = Doc(vocab, words=words) +def _parse_links(vocab, words, spaces, links): + reference = Doc(vocab, words=words, spaces=spaces) starts = {token.idx: token.i for token in reference} ends = {token.idx + len(token): token.i for token in reference} ent_kb_ids = ["" for _ in reference] - entity_map = [(ent[0], ent[1]) for ent in entities] - - # links annotations need to refer 1-1 to entity annotations - throw error otherwise - for index, annot_dict in links.items(): - start_char, end_char = index - if (start_char, end_char) not in entity_map: - raise ValueError(Errors.E981) for index, annot_dict in links.items(): true_kb_ids = [] @@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities): start_char, end_char = index start_token = starts.get(start_char) end_token = ends.get(end_char) + if start_token is None or end_token is None: + raise ValueError(Errors.E981) for i in range(start_token, end_token+1): ent_kb_ids[i] = true_kb_ids[0] @@ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities): def _guess_spaces(text, words): if text is None: - return [True] * len(words) + return None spaces = [] text_pos = 0 # align words with text diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 018830d37..38e6114de 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab): assert new_doc.is_parsed +def test_doc_api_from_docs(en_tokenizer, de_tokenizer): + en_texts = ["Merging the docs is fun.", "They don't 
think alike."] + de_text = "Wie war die Frage?" + en_docs = [en_tokenizer(text) for text in en_texts] + docs_idx = en_texts[0].index('docs') + de_doc = de_tokenizer(de_text) + en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None) + + assert Doc.from_docs([]) is None + + assert de_doc is not Doc.from_docs([de_doc]) + assert str(de_doc) == str(Doc.from_docs([de_doc])) + + with pytest.raises(ValueError): + Doc.from_docs(en_docs + [de_doc]) + + m_doc = Doc.from_docs(en_docs) + assert len(en_docs) == len(list(m_doc.sents)) + assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) + assert str(m_doc) == " ".join(en_texts) + p_token = m_doc[len(en_docs[0])-1] + assert p_token.text == "." and bool(p_token.whitespace_) + en_docs_tokens = [t for doc in en_docs for t in doc] + assert len(m_doc) == len(en_docs_tokens) + think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') + assert m_doc[9].idx == think_idx + with pytest.raises(AttributeError): + not_available = m_doc[2]._.is_ambiguous # not accessible, because it was not set via set_extension + assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there + + m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) + assert len(en_docs) == len(list(m_doc.sents)) + assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1]) + assert str(m_doc) == "".join(en_texts) + p_token = m_doc[len(en_docs[0]) - 1] + assert p_token.text == "." and not bool(p_token.whitespace_) + en_docs_tokens = [t for doc in en_docs for t in doc] + assert len(m_doc) == len(en_docs_tokens) + think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think') + assert m_doc[9].idx == think_idx + + m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos']) + with pytest.raises(ValueError): # important attributes from sentencizer or parser are missing + assert list(m_doc.sents) + assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) + assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although the SPACY attribute was missing + p_token = m_doc[len(en_docs[0]) - 1] + assert p_token.text == "." 
and bool(p_token.whitespace_) + en_docs_tokens = [t for doc in en_docs for t in doc] + assert len(m_doc) == len(en_docs_tokens) + think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') + assert m_doc[9].idx == think_idx + + def test_doc_lang(en_vocab): doc = Doc(en_vocab, words=["Hello", "world"]) assert doc.lang_ == "en" diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 615bb1cd9..85c21f7f9 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -75,3 +75,19 @@ def test_serialize_doc_bin(): for i, doc in enumerate(reloaded_docs): assert doc.text == texts[i] assert doc.cats == cats + + +def test_serialize_doc_bin_unknown_spaces(en_vocab): + doc1 = Doc(en_vocab, words=["that", "'s"]) + assert doc1.has_unknown_spaces + assert doc1.text == "that 's " + doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) + assert not doc2.has_unknown_spaces + assert doc2.text == "that's" + + doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) + re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) + assert re_doc1.has_unknown_spaces + assert re_doc1.text == "that 's " + assert not re_doc2.has_unknown_spaces + assert re_doc2.text == "that's" diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ca0f3710f..e8928f33a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,14 +1,10 @@ import pytest -from spacy.gold import docs_to_json -from spacy.gold.converters import iob2docs, conll_ner2docs -from spacy.gold.converters.conllu2json import conllu2json +from spacy.gold import docs_to_json, biluo_tags_from_offsets +from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English from spacy.cli.pretrain import make_docs -# TODO -# from spacy.gold.converters import conllu2docs - def test_cli_converters_conllu2json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu @@ -19,8 +15,9 @@ def test_cli_converters_conllu2json(): "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", ] input_data = "\n".join(lines) - converted = conllu2json(input_data, n_sents=1) - assert len(converted) == 1 + converted_docs = conllu2docs(input_data, n_sents=1) + assert len(converted_docs) == 1 + converted = [docs_to_json(converted_docs)] assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 @@ -31,7 +28,9 @@ def test_cli_converters_conllu2json(): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] - assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] + ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + assert biluo_tags == ["O", "B-PER", "L-PER", "O"] @pytest.mark.parametrize( @@ -55,11 +54,12 @@ def test_cli_converters_conllu2json(): ) def test_cli_converters_conllu2json_name_ner_map(lines): input_data = "\n".join(lines) - converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) - assert len(converted) == 1 + converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) + assert len(converted_docs) == 1 + converted = 
[docs_to_json(converted_docs)] assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 - assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. " assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 5 @@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines): assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] - assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] + ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] def test_cli_converters_conllu2json_subtokens(): @@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens(): "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", ] input_data = "\n".join(lines) - converted = conllu2json( + converted_docs = conllu2docs( input_data, n_sents=1, merge_subtokens=True, append_morphology=True ) - assert len(converted) == 1 + assert len(converted_docs) == 1 + converted = [docs_to_json(converted_docs)] + assert converted[0]["id"] == 0 assert len(converted[0]["paragraphs"]) == 1 - assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." + assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. " assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 sent = converted[0]["paragraphs"][0]["sentences"][0] assert len(sent["tokens"]) == 4 @@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens(): assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] - assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] + ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] + biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") + assert biluo_tags == ["O", "U-PER", "O", "O"] def test_cli_converters_iob2json(en_vocab): diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index b89654554..58eab4a54 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots): [ { "words": ["I", "like", "New", "York", "and", "Berlin", "."], - "entities": [(7, 15, "LOC"), (20, 26, "LOC")], - "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, + "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}}, } ], ) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d16515a57..edc183e0d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors -ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") +ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") class DocBin(object): @@ -31,6 +31,7 @@ class DocBin(object): "spaces": bytes, # Serialized numpy boolean array with spaces data "lengths": bytes, # Serialized numpy int32 array with 
the doc lengths "strings": List[unicode] # List of unique strings in the token data + "version": str, # DocBin version number } Strings for the words, tags, labels etc are represented by 64-bit hashes in @@ -53,12 +54,14 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ attrs = sorted([intify_attr(attr) for attr in attrs]) + self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.tokens = [] self.spaces = [] self.cats = [] self.user_data = [] + self.flags = [] self.strings = set() self.store_user_data = store_user_data for doc in docs: @@ -83,12 +86,17 @@ class DocBin(object): assert array.shape[0] == spaces.shape[0] # this should never happen spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) + self.flags.append({ + "has_unknown_spaces": doc.has_unknown_spaces + }) for token in doc: self.strings.add(token.text) self.strings.add(token.tag_) self.strings.add(token.lemma_) + self.strings.add(token.morph_) self.strings.add(token.dep_) self.strings.add(token.ent_type_) + self.strings.add(token.ent_kb_id_) self.cats.append(doc.cats) if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) @@ -105,8 +113,11 @@ class DocBin(object): vocab[string] orth_col = self.attrs.index(ORTH) for i in range(len(self.tokens)): + flags = self.flags[i] tokens = self.tokens[i] spaces = self.spaces[i] + if flags.get("has_unknown_spaces"): + spaces = None doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = doc.from_array(self.attrs, tokens) doc.cats = self.cats[i] @@ -130,6 +141,7 @@ class DocBin(object): self.spaces.extend(other.spaces) self.strings.update(other.strings) self.cats.extend(other.cats) + self.flags.extend(other.flags) if self.store_user_data: self.user_data.extend(other.user_data) @@ -147,12 +159,14 @@ class DocBin(object): spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) msg = { + "version": self.version, "attrs": self.attrs, "tokens": tokens.tobytes("C"), "spaces": spaces.tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), "cats": self.cats, + "flags": self.flags, } if self.store_user_data: msg["user_data"] = self.user_data @@ -178,6 +192,7 @@ class DocBin(object): self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths) self.cats = msg["cats"] + self.flags = msg.get("flags", [{} for _ in lengths]) if self.store_user_data and "user_data" in msg: self.user_data = list(msg["user_data"]) for tokens in self.tokens: diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 42918ab6d..2775aa97e 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -59,11 +59,14 @@ cdef class Doc: cdef public dict user_token_hooks cdef public dict user_span_hooks + cdef public bint has_unknown_spaces + cdef public list _py_tokens cdef int length cdef int max_length + cdef public object noun_chunks_iterator cdef object __weakref__ diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 28590e91e..723873e1f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t +import copy from collections import Counter import numpy import numpy.linalg @@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB 
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t -from ..attrs import intify_attrs, IDS +from ..attrs import intify_attr, intify_attrs, IDS from ..util import normalize_slice from ..compat import copy_reg, pickle from ..errors import Errors, Warnings @@ -171,8 +172,7 @@ cdef class Doc: raise ValueError(Errors.E046.format(name=name)) return Underscore.doc_extensions.pop(name) - def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, - orths_and_spaces=None): + def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None): """Create a Doc object. vocab (Vocab): A vocabulary object, which must match any models you @@ -214,28 +214,25 @@ cdef class Doc: self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) cdef bint has_space - if orths_and_spaces is None and words is not None: - if spaces is None: - spaces = [True] * len(words) - elif len(spaces) != len(words): - raise ValueError(Errors.E027) - orths_and_spaces = zip(words, spaces) + if words is None and spaces is not None: + raise ValueError("words must be set if spaces is set") + elif spaces is None and words is not None: + self.has_unknown_spaces = True + else: + self.has_unknown_spaces = False + words = words if words is not None else [] + spaces = spaces if spaces is not None else ([True] * len(words)) + if len(spaces) != len(words): + raise ValueError(Errors.E027) cdef const LexemeC* lexeme - if orths_and_spaces is not None: - orths_and_spaces = list(orths_and_spaces) - for orth_space in orths_and_spaces: - if isinstance(orth_space, unicode): - lexeme = self.vocab.get(self.mem, orth_space) - has_space = True - elif isinstance(orth_space, bytes): - raise ValueError(Errors.E028.format(value=orth_space)) - elif isinstance(orth_space[0], unicode): - lexeme = self.vocab.get(self.mem, orth_space[0]) - has_space = orth_space[1] - else: - lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) - has_space = orth_space[1] - self.push_back(lexeme, has_space) + for word, has_space in zip(words, spaces): + if isinstance(word, unicode): + lexeme = self.vocab.get(self.mem, word) + elif isinstance(word, bytes): + raise ValueError(Errors.E028.format(value=word)) + else: + lexeme = self.vocab.get_by_orth(self.mem, word) + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -806,7 +803,7 @@ cdef class Doc: attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in attrs] if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) if SENT_START in attrs and HEAD in attrs: raise ValueError(Errors.E032) @@ -882,6 +879,87 @@ cdef class Doc: set_children_from_heads(self.c, length) return self + @staticmethod + def from_docs(docs, ensure_whitespace=True, attrs=None): + """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share + the same `Vocab`. + + docs (list): A list of Doc objects. + ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. + attrs (list): Optional list of attribute ID ints or attribute name strings. + RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given. 
+ + DOCS: https://spacy.io/api/doc#from_docs + """ + if not docs: + return None + + vocab = {doc.vocab for doc in docs} + if len(vocab) > 1: + raise ValueError(Errors.E999) + (vocab,) = vocab + + if attrs is None: + attrs = [LEMMA, NORM] + if all(doc.is_nered for doc in docs): + attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE]) + # TODO: separate for is_morphed? + if all(doc.is_tagged for doc in docs): + attrs.extend([TAG, POS, MORPH]) + if all(doc.is_parsed for doc in docs): + attrs.extend([HEAD, DEP]) + else: + attrs.append(SENT_START) + else: + if any(isinstance(attr, str) for attr in attrs): # resolve attribute names + attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs + attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present + if SPACY not in attrs: + attrs.append(SPACY) + + concat_words = [] + concat_spaces = [] + concat_user_data = {} + char_offset = 0 + for doc in docs: + concat_words.extend(t.text for t in doc) + concat_spaces.extend(bool(t.whitespace_) for t in doc) + + for key, value in doc.user_data.items(): + if isinstance(key, tuple) and len(key) == 4: + data_type, name, start, end = key + if start is not None or end is not None: + start += char_offset + if end is not None: + end += char_offset + concat_user_data[(data_type, name, start, end)] = copy.copy(value) + else: + warnings.warn(Warnings.W101.format(name=name)) + else: + warnings.warn(Warnings.W102.format(key=key, value=value)) + char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1 + + arrays = [doc.to_array(attrs) for doc in docs] + + if ensure_whitespace: + spacy_index = attrs.index(SPACY) + for i, array in enumerate(arrays[:-1]): + if len(array) > 0 and not docs[i][-1].is_space: + array[-1][spacy_index] = 1 + token_offset = -1 + for doc in docs[:-1]: + token_offset += len(doc) + if not doc[-1].is_space: + concat_spaces[token_offset] = True + + concat_array = numpy.concatenate(arrays) + + concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data) + + concat_doc.from_array(attrs, concat_array) + + return concat_doc + def get_lca_matrix(self): """Calculates a matrix of Lowest Common Ancestors (LCA) for a given `Doc`, where LCA[i, j] is the index of the lowest common ancestor among @@ -1000,6 +1078,7 @@ cdef class Doc: "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, "cats": lambda: self.cats, + "has_unknown_spaces": lambda: self.has_unknown_spaces } for key in kwargs: if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): @@ -1032,6 +1111,7 @@ cdef class Doc: "cats": lambda b: None, "user_data_keys": lambda b: None, "user_data_values": lambda b: None, + "has_unknown_spaces": lambda b: None } for key in kwargs: if key in deserializers or key in ("user_data",): @@ -1052,6 +1132,8 @@ cdef class Doc: self.tensor = msg["tensor"] if "cats" not in exclude and "cats" in msg: self.cats = msg["cats"] + if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg: + self.has_unknown_spaces = msg["has_unknown_spaces"] start = 0 cdef const LexemeC* lex cdef unicode orth_ @@ -1123,50 +1205,6 @@ cdef class Doc: remove_label_if_necessary(attributes[i]) retokenizer.merge(span, attributes[i]) - def merge(self, int start_idx, int end_idx, *args, **attributes): - """Retokenize the document, such that the span at - `doc.text[start_idx : end_idx]` is merged into a single token. 
If - `start_idx` and `end_idx `do not mark start and end token boundaries, - the document remains unchanged. - - start_idx (int): Character index of the start of the slice to merge. - end_idx (int): Character index after the end of the slice to merge. - **attributes: Attributes to assign to the merged token. By default, - attributes are inherited from the syntactic root of the span. - RETURNS (Token): The newly merged token, or `None` if the start and end - indices did not fall at token boundaries. - """ - cdef unicode tag, lemma, ent_type - warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning) - # TODO: ENT_KB_ID ? - if len(args) == 3: - warnings.warn(Warnings.W003, DeprecationWarning) - tag, lemma, ent_type = args - attributes[TAG] = tag - attributes[LEMMA] = lemma - attributes[ENT_TYPE] = ent_type - elif not args: - fix_attributes(self, attributes) - elif args: - raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args), - kwargs=repr(attributes))) - remove_label_if_necessary(attributes) - attributes = intify_attrs(attributes, strings_map=self.vocab.strings) - cdef int start = token_by_start(self.c, self.length, start_idx) - if start == -1: - return None - cdef int end = token_by_end(self.c, self.length, end_idx) - if end == -1: - return None - # Currently we have the token index, we want the range-end index - end += 1 - with self.retokenize() as retokenizer: - retokenizer.merge(self[start:end], attrs=attributes) - return self[start] - - def print_tree(self, light=False, flat=False): - raise ValueError(Errors.E105) - def to_json(self, underscore=None): """Convert a Doc to JSON. The format it produces will be the new format for the `spacy train` command (not implemented yet). diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index b8f79f8a6..902d46f5a 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -280,18 +280,6 @@ cdef class Span: return array - def merge(self, *args, **attributes): - """Retokenize the document, such that the span is merged into a single - token. - - **attributes: Attributes to assign to the merged token. By default, - attributes are inherited from the syntactic root token of the span. - RETURNS (Token): The newly merged token. - """ - warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning) - return self.doc.merge(self.start_char, self.end_char, *args, - **attributes) - def get_lca_matrix(self): """Calculates a matrix of Lowest Common Ancestors (LCA) for a given `Span`, where LCA[i, j] is the index of the lowest common ancestor among diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index d585cbd25..3b31b2c80 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -349,6 +349,33 @@ array of attributes. | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Doc` | Itself. | + +## Doc.from_docs {#from_docs tag="staticmethod"} + +Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`. 
+ +> #### Example +> +> ```python +> from spacy.tokens import Doc +> texts = ["London is the capital of the United Kingdom.", +> "The River Thames flows through London.", +> "The famous Tower Bridge crosses the River Thames."] +> docs = list(nlp.pipe(texts)) +> c_doc = Doc.from_docs(docs) +> assert str(c_doc) == " ".join(texts) +> assert len(list(c_doc.sents)) == len(docs) +> assert [str(ent) for ent in c_doc.ents] == \ +> [str(ent) for doc in docs for ent in doc.ents] +> ``` + +| Name | Type | Description | +| ------------------- | ----- | ----------------------------------------------------------------------------------------------- | +| `docs` | list | A list of `Doc` objects. | +| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. | +| `attrs` | list | Optional list of attribute ID ints or attribute name strings. | +| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. | + ## Doc.to_disk {#to_disk tag="method" new="2"} Save the current state to a directory. diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index fe8c359f7..07f95f91d 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where the msgpack object has the following structure: ```python -### msgpack object strcutrue +### msgpack object structure { + "version": str, # DocBin version number "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "tokens": bytes, # Serialized numpy uint64 array with the token data "spaces": bytes, # Serialized numpy boolean array with spaces data @@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations. | Argument | Type | Description | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | +| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | **RETURNS** | `DocBin` | The newly constructed object. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 8210f7094..636354496 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names. ## Matcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. As of -spaCy v2.3, the `Matcher` can also be called on `Span` objects. +Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > #### Example > > ```python 
> > matcher = Matcher(nlp.vocab) > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] -> matcher.add("HelloWorld", None, pattern) +> matcher.add("HelloWorld", [pattern]) > doc = nlp("hello world!") > matches = matcher(doc) > ``` | Name | Type | Description | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | +| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | - - -By default, the matcher **does not perform any action** on matches, like tagging -matched phrases with entity types. Instead, actions need to be specified when -**adding patterns or entities**, by passing in a callback function as the -`on_match` argument on [`add`](/api/matcher#add). This allows you to define -custom actions per pattern within the same matcher. For example, you might only -want to merge some entity types, and set custom flags for other matched -patterns. For more details and examples, see the usage guide on -[rule-based matching](/usage/rule-based-matching). - - - ## Matcher.pipe {#pipe tag="method"} Match a stream of documents, yielding them in turn. @@ -92,7 +78,7 @@ patterns. > ```python > matcher = Matcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > assert len(matcher) == 1 > ``` @@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID. > > ```python > matcher = Matcher(nlp.vocab) -> assert 'Rule' not in matcher -> matcher.add('Rule', None, [{'ORTH': 'test'}]) -> assert 'Rule' in matcher +> assert "Rule" not in matcher +> matcher.add("Rule", [[{'ORTH': 'test'}]]) +> assert "Rule" in matcher > ``` | Name | Type | Description | @@ -129,39 +115,39 @@ overwritten. > #### Example > > ```python -> def on_match(matcher, doc, id, matches): -> print('Matched!', matches) +> def on_match(matcher, doc, id, matches): +> print('Matched!', matches) > -> matcher = Matcher(nlp.vocab) -> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) -> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) -> doc = nlp("HELLO WORLD on Google Maps.") -> matches = matcher(doc) +> matcher = Matcher(nlp.vocab) +> patterns = [ +> [{"LOWER": "hello"}, {"LOWER": "world"}], +> [{"ORTH": "Google"}, {"ORTH": "Maps"}] +> ] +> matcher.add("TEST_PATTERNS", patterns) +> doc = nlp("HELLO WORLD on Google Maps.") +> matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | -| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | + - - -As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become -the default in the future. 
The patterns are now the second argument and a list +As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. ```diff patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -- matcher.add("GoogleNow", None, *patterns) -+ matcher.add("GoogleNow", patterns) - matcher.add("GoogleNow", on_match, *patterns) + matcher.add("GoogleNow", patterns, on_match=on_match) ``` +| Name | Type | Description | +| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | +| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | + ## Matcher.remove {#remove tag="method" new="2"} Remove a rule from the matcher. A `KeyError` is raised if the match ID does not @@ -170,7 +156,7 @@ exist. > #### Example > > ```python -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > assert "Rule" in matcher > matcher.remove("Rule") > assert "Rule" not in matcher @@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > #### Example > > ```python -> matcher.add("Rule", None, [{"ORTH": "test"}]) +> matcher.add("Rule", [[{"ORTH": "test"}]]) > on_match, patterns = matcher.get("Rule") > ``` diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index f02d81de9..9c722297d 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. > from spacy.matcher import PhraseMatcher > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > doc = nlp("Barack Obama lifts America one last time in emotional farewell") > matches = matcher(doc) > ``` @@ -104,7 +104,7 @@ patterns. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert len(matcher) == 1 > ``` @@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert "OBAMA" not in matcher -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert "OBAMA" in matcher > ``` @@ -145,36 +145,32 @@ overwritten. > print('Matched!', matches) > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", on_match, nlp("Barack Obama")) -> matcher.add("HEALTH", on_match, nlp("health care reform"), -> nlp("healthcare reform")) +> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match) +> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match) > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > matches = matcher(doc) > ``` -| Name | Type | Description | -| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `on_match` | callable or `None` | Callback function to act on matches. 
Takes the arguments `matcher`, `doc`, `i` and `matches`. | -| `*docs` | `Doc` | `Doc` objects of the phrases to match. | + - - -As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will -become the default in the future. The `Doc` patterns are now the second argument -and a list (instead of a variable number of arguments). The `on_match` callback +As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second +argument (instead of a variable number of arguments). The `on_match` callback becomes an optional keyword argument. ```diff patterns = [nlp("health care reform"), nlp("healthcare reform")] -- matcher.add("HEALTH", None, *patterns) -+ matcher.add("HEALTH", patterns) - matcher.add("HEALTH", on_match, *patterns) + matcher.add("HEALTH", patterns, on_match=on_match) ``` +| Name | Type | Description | +| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `docs` | list | `Doc` objects of the phrases to match. | +| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | + ## PhraseMatcher.remove {#remove tag="method" new="2.2"} Remove a rule from the matcher by match ID. A `KeyError` is raised if the key @@ -184,7 +180,7 @@ does not exist. > > ```python > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp("Barack Obama")) +> matcher.add("OBAMA", [nlp("Barack Obama")]) > assert "OBAMA" in matcher > matcher.remove("OBAMA") > assert "OBAMA" not in matcher diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0ead27a49..6b32dc422 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -407,7 +407,7 @@ class EntityMatcher(object): def __init__(self, nlp, terms, label): patterns = [nlp.make_doc(text) for text in terms] self.matcher = PhraseMatcher(nlp.vocab) - self.matcher.add(label, None, *patterns) + self.matcher.add(label, patterns) def __call__(self, doc): matches = self.matcher(doc) diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 9a8f3da7b..d0ee44e49 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -98,9 +98,7 @@ print([token.text for token in doc]) First, we initialize the `Matcher` with a vocab. The matcher must always share the same vocab with the documents it will operate on. We can now call -[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The -second argument lets you pass in an optional callback function to invoke on a -successful match. For now, we set it to `None`. +[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns. ```python ### {executable="true"} @@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab) # Add match ID "HelloWorld" with no callback and one pattern pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] -matcher.add("HelloWorld", None, pattern) +matcher.add("HelloWorld", [pattern]) doc = nlp("Hello, world! 
Hello world!") matches = matcher(doc) @@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to also match sequences without punctuation between "hello" and "world": ```python -matcher.add("HelloWorld", None, - [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], - [{"LOWER": "hello"}, {"LOWER": "world"}]) +patterns = [ + [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], + [{"LOWER": "hello"}, {"LOWER": "world"}] +] +matcher.add("HelloWorld", patterns) ``` By default, the matcher will only return the matches and **not do anything @@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab, validate=True) # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] -matcher.add("HelloWorld", None, pattern) +matcher.add("HelloWorld", [pattern]) # 🚨 Raises an error: # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # Pattern 0: @@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches): print(entity.text) pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] -matcher.add("GoogleIO", add_event_ent, pattern) +matcher.add("GoogleIO", [pattern], on_match=add_event_ent) doc = nlp("This is a text about Google I/O") matches = matcher(doc) ``` @@ -509,19 +509,18 @@ import spacy from spacy.matcher import Matcher from spacy.tokens import Token -# We're using a class because the component needs to be initialised with +# We're using a class because the component needs to be initialized with # the shared vocab via the nlp object class BadHTMLMerger(object): def __init__(self, nlp): + patterns = [ + [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], + [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], + ] # Register a new token extension to flag bad HTML Token.set_extension("bad_html", default=False) self.matcher = Matcher(nlp.vocab) - self.matcher.add( - "BAD_HTML", - None, - [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], - [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], - ) + self.matcher.add("BAD_HTML", patterns) def __call__(self, doc): # This method is invoked when the component is called on a Doc @@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches): pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}] -matcher.add("FacebookIs", collect_sents, pattern) # add pattern +matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?") matches = matcher(doc) @@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm") matcher = Matcher(nlp.vocab) pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] -matcher.add("PHONE_NUMBER", None, pattern) +matcher.add("PHONE_NUMBER", [pattern]) doc = nlp("Call me at (123) 456 789 or (123) 456 789!") print([t.text for t in doc]) @@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches): elif doc.vocab.strings[match_id] == "SAD": doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment -matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern -matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern +matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern +matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern # Add pattern for valid hashtag, i.e. 
'#' plus any ASCII token -matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) +matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]]) doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) @@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab) terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] # Only run nlp.make_doc to speed things up patterns = [nlp.make_doc(text) for text in terms] -matcher.add("TerminologyList", None, *patterns) +matcher.add("TerminologyList", patterns) doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " "converse in the Oval Office inside the White House in Washington, D.C.") @@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="LOWER") patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] -matcher.add("Names", None, *patterns) +matcher.add("Names", patterns) doc = nlp("angela merkel and us president barack Obama") for match_id, start, end in matcher(doc): @@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") -matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) +matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")]) doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") for match_id, start, end in matcher(doc): diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 39d732724..aa8aa59af 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab) def set_sentiment(matcher, doc, i, matches): doc.sentiment += 0.1 -pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] -pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] -matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" -matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji +patterns1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]] +patterns2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] +matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o" +matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji doc = nlp("A text about Google I/O 😀😀") matches = matcher(doc)
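Below is a minimal usage sketch (not part of the patch itself) of the `Doc.from_docs` API introduced in this diff; it mirrors the assertions in `test_doc_api_from_docs` above and assumes a spaCy build that includes this change, using a blank English pipeline so only the tokenizer runs.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")  # tokenizer-only pipeline
texts = ["Merging the docs is fun.", "They don't think alike."]
docs = [nlp(text) for text in texts]

# By default, ensure_whitespace=True inserts a space between adjacent docs
# whenever the preceding doc does not end in whitespace.
merged = Doc.from_docs(docs)
assert merged.text == " ".join(texts)

# With ensure_whitespace=False the texts are concatenated exactly as-is.
merged_raw = Doc.from_docs(docs, ensure_whitespace=False)
assert merged_raw.text == "".join(texts)

# An empty list of docs yields None; docs from different vocabs raise an error (E999).
assert Doc.from_docs([]) is None
```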