Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

Commit 949d4a0a0b: Merge branch 'develop' into nightly.spacy.io
				|  | @ -1,6 +1,6 @@ | |||
| # fmt: off | ||||
| __title__ = "spacy-nightly" | ||||
| __version__ = "3.0.0a0" | ||||
| __version__ = "3.0.0a1" | ||||
| __release__ = True | ||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||
|  |  | |||
|  | @ -9,7 +9,7 @@ import sys | |||
| from ._app import app, Arg, Opt | ||||
| from ..gold import docs_to_json | ||||
| from ..tokens import DocBin | ||||
| from ..gold.converters import iob2docs, conll_ner2docs, json2docs | ||||
| from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs | ||||
| 
 | ||||
| 
 | ||||
| # Converters are matched by file extension except for ner/iob, which are | ||||
|  | @ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs | |||
| # imported from /converters. | ||||
| 
 | ||||
| CONVERTERS = { | ||||
|     # "conllubio": conllu2docs, TODO | ||||
|     # "conllu": conllu2docs, TODO | ||||
|     # "conll": conllu2docs, TODO | ||||
|     "conllubio": conllu2docs, | ||||
|     "conllu": conllu2docs, | ||||
|     "conll": conllu2docs, | ||||
|     "ner": conll_ner2docs, | ||||
|     "iob": iob2docs, | ||||
|     "json": json2docs, | ||||
|  | @ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type): | |||
|     if output_type == "json": | ||||
|         srsly.write_json("-", docs_to_json(docs)) | ||||
|     else: | ||||
|         sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) | ||||
|         sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) | ||||
| 
 | ||||
| 
 | ||||
| def _write_docs_to_file(docs, output_file, output_type): | ||||
|  | @ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type): | |||
|     if output_type == "json": | ||||
|         srsly.write_json(output_file, docs_to_json(docs)) | ||||
|     else: | ||||
|         data = DocBin(docs=docs).to_bytes() | ||||
|         data = DocBin(docs=docs, store_user_data=True).to_bytes() | ||||
|         with output_file.open("wb") as file_: | ||||
|             file_.write(data) | ||||
|   | ||||
|  |  | |||
|  | @ -37,7 +37,7 @@ def init_model_cli( | |||
|     clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), | ||||
|     jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), | ||||
|     vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), | ||||
|     prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), | ||||
|     prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"), | ||||
|     truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), | ||||
|     vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), | ||||
|     model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), | ||||
|  | @ -56,6 +56,7 @@ def init_model_cli( | |||
|         freqs_loc=freqs_loc, | ||||
|         clusters_loc=clusters_loc, | ||||
|         jsonl_loc=jsonl_loc, | ||||
|         vectors_loc=vectors_loc, | ||||
|         prune_vectors=prune_vectors, | ||||
|         truncate_vectors=truncate_vectors, | ||||
|         vectors_name=vectors_name, | ||||
|  | @ -228,7 +229,7 @@ def add_vectors( | |||
|     else: | ||||
|         if vectors_loc: | ||||
|             with msg.loading(f"Reading vectors from {vectors_loc}"): | ||||
|                 vectors_data, vector_keys = read_vectors(msg, vectors_loc) | ||||
|                 vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors) | ||||
|             msg.good(f"Loaded vectors from {vectors_loc}") | ||||
|         else: | ||||
|             vectors_data, vector_keys = (None, None) | ||||
|  | @ -247,7 +248,7 @@ def add_vectors( | |||
|         nlp.vocab.prune_vectors(prune_vectors) | ||||
| 
 | ||||
| 
 | ||||
| def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): | ||||
| def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int): | ||||
|     f = open_file(vectors_loc) | ||||
|     shape = tuple(int(size) for size in next(f).split()) | ||||
|     if truncate_vectors >= 1: | ||||
|  |  | |||
|  | @ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model | |||
| from ..tokens import Doc | ||||
| from ..attrs import ID, HEAD | ||||
| from .. import util | ||||
| from ..gold import Example | ||||
| 
 | ||||
| 
 | ||||
| @app.command("pretrain") | ||||
|  | @ -183,7 +182,7 @@ def pretrain( | |||
|         for batch_id, batch in enumerate(batches): | ||||
|             docs, count = make_docs( | ||||
|                 nlp, | ||||
|                 [ex.doc for ex in batch], | ||||
|                 batch, | ||||
|                 max_length=pretrain_config["max_length"], | ||||
|                 min_length=pretrain_config["min_length"], | ||||
|             ) | ||||
|  |  | |||
|  | @ -159,6 +159,8 @@ class Warnings(object): | |||
|     W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " | ||||
|             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " | ||||
|             "string \"Field1=Value1,Value2|Field2=Value3\".") | ||||
|     W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.") | ||||
|     W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  | @ -556,8 +558,8 @@ class Errors(object): | |||
|     E979 = ("Cannot convert {type} to an Example object.") | ||||
|     E980 = ("Each link annotation should refer to a dictionary with at most one " | ||||
|             "identifier mapping to 1.0, and all others to 0.0.") | ||||
|     E981 = ("The offsets of the annotations for 'links' need to refer exactly " | ||||
|             "to the offsets of the 'entities' annotations.") | ||||
|     E981 = ("The offsets of the annotations for 'links' could not be aligned " | ||||
|             "to token boundaries.") | ||||
|     E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " | ||||
|             "into {values}, but found {value}.") | ||||
|     E983 = ("Invalid key for '{dict}': {key}. Available keys: " | ||||
|  | @ -593,6 +595,8 @@ class Errors(object): | |||
|     E997 = ("Tokenizer special cases are not allowed to modify the text. " | ||||
|             "This would map '{chunk}' to '{orth}' given token attributes " | ||||
|             "'{token_attrs}'.") | ||||
|     E999 = ("Unable to merge the `Doc` objects because they do not all share " | ||||
|             "the same `Vocab`.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  |  | |||
|  | @ -1,6 +1,4 @@ | |||
| from .iob2docs import iob2docs  # noqa: F401 | ||||
| from .conll_ner2docs import conll_ner2docs  # noqa: F401 | ||||
| from .json2docs import json2docs | ||||
| 
 | ||||
| # TODO: Update this one | ||||
| # from .conllu2docs import conllu2docs  # noqa: F401 | ||||
| from .conllu2docs import conllu2docs  # noqa: F401 | ||||
|  |  | |||
|  | @ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info | |||
| from ...gold import Example | ||||
| from ...gold import iob_to_biluo, spans_from_biluo_tags | ||||
| from ...language import Language | ||||
| from ...tokens import Doc, Token | ||||
| from ...tokens import Doc, Token, Span | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| 
 | ||||
| def conllu2json( | ||||
| def conllu2docs( | ||||
|     input_data, | ||||
|     n_sents=10, | ||||
|     append_morphology=False, | ||||
|  | @ -28,34 +28,22 @@ def conllu2json( | |||
|     MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" | ||||
|     msg = Printer(no_print=no_print) | ||||
|     n_sents_info(msg, n_sents) | ||||
|     docs = [] | ||||
|     raw = "" | ||||
|     sentences = [] | ||||
|     conll_data = read_conllx( | ||||
|     sent_docs = read_conllx( | ||||
|         input_data, | ||||
|         append_morphology=append_morphology, | ||||
|         ner_tag_pattern=MISC_NER_PATTERN, | ||||
|         ner_map=ner_map, | ||||
|         merge_subtokens=merge_subtokens, | ||||
|     ) | ||||
|     has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) | ||||
|     for i, example in enumerate(conll_data): | ||||
|         raw += example.text | ||||
|         sentences.append( | ||||
|             generate_sentence( | ||||
|                 example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, | ||||
|             ) | ||||
|         ) | ||||
|         # Real-sized documents could be extracted using the comments on the | ||||
|         # conllu document | ||||
|         if len(sentences) % n_sents == 0: | ||||
|             doc = create_json_doc(raw, sentences, i) | ||||
|             docs.append(doc) | ||||
|             raw = "" | ||||
|             sentences = [] | ||||
|     if sentences: | ||||
|         doc = create_json_doc(raw, sentences, i) | ||||
|         docs.append(doc) | ||||
|     docs = [] | ||||
|     sent_docs_to_merge = [] | ||||
|     for sent_doc in sent_docs: | ||||
|         sent_docs_to_merge.append(sent_doc) | ||||
|         if len(sent_docs_to_merge) % n_sents == 0: | ||||
|             docs.append(Doc.from_docs(sent_docs_to_merge)) | ||||
|             sent_docs_to_merge = [] | ||||
|     if sent_docs_to_merge: | ||||
|         docs.append(Doc.from_docs(sent_docs_to_merge)) | ||||
|     return docs | ||||
| 
 | ||||
| 
 | ||||
|  | @ -84,14 +72,14 @@ def read_conllx( | |||
|     ner_tag_pattern="", | ||||
|     ner_map=None, | ||||
| ): | ||||
|     """ Yield examples, one for each sentence """ | ||||
|     """ Yield docs, one for each sentence """ | ||||
|     vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc | ||||
|     for sent in input_data.strip().split("\n\n"): | ||||
|         lines = sent.strip().split("\n") | ||||
|         if lines: | ||||
|             while lines[0].startswith("#"): | ||||
|                 lines.pop(0) | ||||
|             example = example_from_conllu_sentence( | ||||
|             doc = doc_from_conllu_sentence( | ||||
|                 vocab, | ||||
|                 lines, | ||||
|                 ner_tag_pattern, | ||||
|  | @ -99,7 +87,7 @@ def read_conllx( | |||
|                 append_morphology=append_morphology, | ||||
|                 ner_map=ner_map, | ||||
|             ) | ||||
|             yield example | ||||
|             yield doc | ||||
| 
 | ||||
| 
 | ||||
| def get_entities(lines, tag_pattern, ner_map=None): | ||||
|  | @ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None): | |||
|     return iob_to_biluo(iob) | ||||
| 
 | ||||
| 
 | ||||
| def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None): | ||||
|     sentence = {} | ||||
|     tokens = [] | ||||
|     token_annotation = example_dict["token_annotation"] | ||||
|     for i, id_ in enumerate(token_annotation["ids"]): | ||||
|         token = {} | ||||
|         token["id"] = id_ | ||||
|         token["orth"] = token_annotation["words"][i] | ||||
|         token["tag"] = token_annotation["tags"][i] | ||||
|         token["pos"] = token_annotation["pos"][i] | ||||
|         token["lemma"] = token_annotation["lemmas"][i] | ||||
|         token["morph"] = token_annotation["morphs"][i] | ||||
|         token["head"] = token_annotation["heads"][i] - i | ||||
|         token["dep"] = token_annotation["deps"][i] | ||||
|         if has_ner_tags: | ||||
|             token["ner"] = example_dict["doc_annotation"]["entities"][i] | ||||
|         tokens.append(token) | ||||
|     sentence["tokens"] = tokens | ||||
|     return sentence | ||||
| 
 | ||||
| 
 | ||||
| def create_json_doc(raw, sentences, id_): | ||||
|     doc = {} | ||||
|     paragraph = {} | ||||
|     doc["id"] = id_ | ||||
|     doc["paragraphs"] = [] | ||||
|     paragraph["raw"] = raw.strip() | ||||
|     paragraph["sentences"] = sentences | ||||
|     doc["paragraphs"].append(paragraph) | ||||
|     return doc | ||||
| 
 | ||||
| 
 | ||||
| def example_from_conllu_sentence( | ||||
| def doc_from_conllu_sentence( | ||||
|     vocab, | ||||
|     lines, | ||||
|     ner_tag_pattern, | ||||
|  | @ -263,8 +219,9 @@ def example_from_conllu_sentence( | |||
|     if merge_subtokens: | ||||
|         doc = merge_conllu_subtokens(lines, doc) | ||||
| 
 | ||||
|     # create Example from custom Doc annotation | ||||
|     words, spaces, tags, morphs, lemmas = [], [], [], [], [] | ||||
|     # create final Doc from custom Doc annotation | ||||
|     words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], [] | ||||
|     heads, deps = [], [] | ||||
|     for i, t in enumerate(doc): | ||||
|         words.append(t._.merged_orth) | ||||
|         lemmas.append(t._.merged_lemma) | ||||
|  | @ -274,16 +231,23 @@ def example_from_conllu_sentence( | |||
|             tags.append(t.tag_ + "__" + t._.merged_morph) | ||||
|         else: | ||||
|             tags.append(t.tag_) | ||||
|         poses.append(t.pos_) | ||||
|         heads.append(t.head.i) | ||||
|         deps.append(t.dep_) | ||||
| 
 | ||||
|     doc_x = Doc(vocab, words=words, spaces=spaces) | ||||
|     ref_dict = Example(doc_x, reference=doc).to_dict() | ||||
|     ref_dict["words"] = words | ||||
|     ref_dict["lemmas"] = lemmas | ||||
|     ref_dict["spaces"] = spaces | ||||
|     ref_dict["tags"] = tags | ||||
|     ref_dict["morphs"] = morphs | ||||
|     example = Example.from_dict(doc_x, ref_dict) | ||||
|     return example | ||||
|     for i in range(len(doc)): | ||||
|         doc_x[i].tag_ = tags[i] | ||||
|         doc_x[i].morph_ = morphs[i] | ||||
|         doc_x[i].lemma_ = lemmas[i] | ||||
|         doc_x[i].pos_ = poses[i] | ||||
|         doc_x[i].dep_ = deps[i] | ||||
|         doc_x[i].head = doc_x[heads[i]] | ||||
|     doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents] | ||||
|     doc_x.is_parsed = True | ||||
|     doc_x.is_tagged = True | ||||
| 
 | ||||
|     return doc_x | ||||
| 
 | ||||
| 
 | ||||
| def merge_conllu_subtokens(lines, doc): | ||||
|  | @ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs): | |||
|         for json_para in json_to_annotations(json_doc): | ||||
|             example_dict = _fix_legacy_dict_data(json_para) | ||||
|             tok_dict, doc_dict = _parse_example_dict_data(example_dict) | ||||
|             if json_para.get("raw"): | ||||
|                 assert tok_dict.get("SPACY") | ||||
|             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) | ||||
|             docs.append(doc) | ||||
|     return docs | ||||
|  |  | |||
|  | @ -43,24 +43,35 @@ class Corpus: | |||
|                 locs.append(path) | ||||
|         return locs | ||||
| 
 | ||||
|     def _make_example(self, nlp, reference, gold_preproc): | ||||
|         if gold_preproc or reference.has_unknown_spaces: | ||||
|             return Example( | ||||
|                 Doc( | ||||
|                     nlp.vocab, | ||||
|                     words=[word.text for word in reference], | ||||
|                     spaces=[bool(word.whitespace_) for word in reference] | ||||
|                 ), | ||||
|                 reference | ||||
|             ) | ||||
|         else: | ||||
|             return Example( | ||||
|                 nlp.make_doc(reference.text), | ||||
|                 reference | ||||
|             ) | ||||
|   | ||||
|     def make_examples(self, nlp, reference_docs, max_length=0): | ||||
|         for reference in reference_docs: | ||||
|             if len(reference) == 0: | ||||
|                 continue | ||||
|             elif max_length == 0 or len(reference) < max_length: | ||||
|                 yield Example( | ||||
|                     nlp.make_doc(reference.text), | ||||
|                     reference | ||||
|                 ) | ||||
|                 yield self._make_example(nlp, reference, False) | ||||
|             elif reference.is_sentenced: | ||||
|                 for ref_sent in reference.sents: | ||||
|                     if len(ref_sent) == 0: | ||||
|                         continue | ||||
|                     elif max_length == 0 or len(ref_sent) < max_length: | ||||
|                         yield Example( | ||||
|                             nlp.make_doc(ref_sent.text), | ||||
|                             ref_sent.as_doc() | ||||
|                         ) | ||||
|                         yield self._make_example(nlp, ref_sent.as_doc(), False) | ||||
|      | ||||
| 
 | ||||
|     def make_examples_gold_preproc(self, nlp, reference_docs): | ||||
|         for reference in reference_docs: | ||||
|  | @ -69,14 +80,7 @@ class Corpus: | |||
|             else: | ||||
|                 ref_sents = [reference] | ||||
|             for ref_sent in ref_sents: | ||||
|                 eg = Example( | ||||
|                     Doc( | ||||
|                         nlp.vocab,  | ||||
|                         words=[w.text for w in ref_sent], | ||||
|                         spaces=[bool(w.whitespace_) for w in ref_sent] | ||||
|                     ), | ||||
|                     ref_sent | ||||
|                 ) | ||||
|                 eg = self._make_example(nlp, ref_sent, True) | ||||
|                 if len(eg.x): | ||||
|                     yield eg | ||||
| 
 | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ from ..syntax import nonproj | |||
| 
 | ||||
| 
 | ||||
| cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): | ||||
|     """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ | ||||
|     """ Create a Doc from dictionaries with token and doc annotations. """ | ||||
|     attrs, array = _annot2array(vocab, tok_annot, doc_annot) | ||||
|     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) | ||||
|     if "entities" in doc_annot: | ||||
|  | @ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot): | |||
|             if key == "entities": | ||||
|                 pass | ||||
|             elif key == "links": | ||||
|                 entities = doc_annot.get("entities", {}) | ||||
|                 if not entities: | ||||
|                     raise ValueError(Errors.E981) | ||||
|                 ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities) | ||||
|                 ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value) | ||||
|                 tok_annot["ENT_KB_ID"] = ent_kb_ids | ||||
|             elif key == "cats": | ||||
|                 pass | ||||
|  | @ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces): | |||
|                 ent_types.append("") | ||||
|     return ent_iobs, ent_types | ||||
| 
 | ||||
| def _parse_links(vocab, words, links, entities): | ||||
|     reference = Doc(vocab, words=words) | ||||
| def _parse_links(vocab, words, spaces, links): | ||||
|     reference = Doc(vocab, words=words, spaces=spaces) | ||||
|     starts = {token.idx: token.i for token in reference} | ||||
|     ends = {token.idx + len(token): token.i for token in reference} | ||||
|     ent_kb_ids = ["" for _ in reference] | ||||
|     entity_map = [(ent[0], ent[1]) for ent in entities] | ||||
| 
 | ||||
|     # links annotations need to refer 1-1 to entity annotations - throw error otherwise | ||||
|     for index, annot_dict in links.items(): | ||||
|         start_char, end_char = index | ||||
|         if (start_char, end_char) not in entity_map: | ||||
|             raise ValueError(Errors.E981) | ||||
| 
 | ||||
|     for index, annot_dict in links.items(): | ||||
|         true_kb_ids = [] | ||||
|  | @ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities): | |||
|             start_char, end_char = index | ||||
|             start_token = starts.get(start_char) | ||||
|             end_token = ends.get(end_char) | ||||
|             if start_token is None or end_token is None: | ||||
|                 raise ValueError(Errors.E981) | ||||
|             for i in range(start_token, end_token+1): | ||||
|                 ent_kb_ids[i] = true_kb_ids[0] | ||||
| 
 | ||||
|  | @ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities): | |||
| 
 | ||||
| def _guess_spaces(text, words): | ||||
|     if text is None: | ||||
|         return [True] * len(words) | ||||
|         return None | ||||
|     spaces = [] | ||||
|     text_pos = 0 | ||||
|     # align words with text | ||||
|  |  | |||
|  | @ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab): | |||
|     assert new_doc.is_parsed | ||||
| 
 | ||||
| 
 | ||||
| def test_doc_api_from_docs(en_tokenizer, de_tokenizer): | ||||
|     en_texts = ["Merging the docs is fun.", "They don't think alike."] | ||||
|     de_text = "Wie war die Frage?" | ||||
|     en_docs = [en_tokenizer(text) for text in en_texts] | ||||
|     docs_idx = en_texts[0].index('docs') | ||||
|     de_doc = de_tokenizer(de_text) | ||||
|     en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None) | ||||
| 
 | ||||
|     assert Doc.from_docs([]) is None | ||||
| 
 | ||||
|     assert de_doc is not Doc.from_docs([de_doc]) | ||||
|     assert str(de_doc) == str(Doc.from_docs([de_doc])) | ||||
| 
 | ||||
|     with pytest.raises(ValueError): | ||||
|         Doc.from_docs(en_docs + [de_doc]) | ||||
| 
 | ||||
|     m_doc = Doc.from_docs(en_docs) | ||||
|     assert len(en_docs) == len(list(m_doc.sents)) | ||||
|     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) | ||||
|     assert str(m_doc) == " ".join(en_texts) | ||||
|     p_token = m_doc[len(en_docs[0])-1] | ||||
|     assert p_token.text == "." and bool(p_token.whitespace_) | ||||
|     en_docs_tokens = [t for doc in en_docs for t in doc] | ||||
|     assert len(m_doc) == len(en_docs_tokens) | ||||
|     think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') | ||||
|     assert m_doc[9].idx == think_idx | ||||
|     with pytest.raises(AttributeError): | ||||
|         not_available = m_doc[2]._.is_ambiguous             # raises because the extension was not registered via set_extension | ||||
|     assert len(m_doc.user_data) == len(en_docs[0].user_data)    # but it's there | ||||
| 
 | ||||
|     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) | ||||
|     assert len(en_docs) == len(list(m_doc.sents)) | ||||
|     assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1]) | ||||
|     assert str(m_doc) == "".join(en_texts) | ||||
|     p_token = m_doc[len(en_docs[0]) - 1] | ||||
|     assert p_token.text == "." and not bool(p_token.whitespace_) | ||||
|     en_docs_tokens = [t for doc in en_docs for t in doc] | ||||
|     assert len(m_doc) == len(en_docs_tokens) | ||||
|     think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think') | ||||
|     assert m_doc[9].idx == think_idx | ||||
| 
 | ||||
|     m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos']) | ||||
|     with pytest.raises(ValueError):                 # important attributes from sentencizer or parser are missing | ||||
|         assert list(m_doc.sents) | ||||
|     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) | ||||
|     assert str(m_doc) == " ".join(en_texts)         # space delimiter considered, although spacy attribute was missing | ||||
|     p_token = m_doc[len(en_docs[0]) - 1] | ||||
|     assert p_token.text == "." and bool(p_token.whitespace_) | ||||
|     en_docs_tokens = [t for doc in en_docs for t in doc] | ||||
|     assert len(m_doc) == len(en_docs_tokens) | ||||
|     think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think') | ||||
|     assert m_doc[9].idx == think_idx | ||||
| 
 | ||||
| 
 | ||||
| def test_doc_lang(en_vocab): | ||||
|     doc = Doc(en_vocab, words=["Hello", "world"]) | ||||
|     assert doc.lang_ == "en" | ||||
|  |  | |||
|  | @ -75,3 +75,19 @@ def test_serialize_doc_bin(): | |||
|     for i, doc in enumerate(reloaded_docs): | ||||
|         assert doc.text == texts[i] | ||||
|         assert doc.cats == cats | ||||
| 
 | ||||
| 
 | ||||
| def test_serialize_doc_bin_unknown_spaces(en_vocab): | ||||
|     doc1 = Doc(en_vocab, words=["that", "'s"]) | ||||
|     assert doc1.has_unknown_spaces | ||||
|     assert doc1.text == "that 's " | ||||
|     doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) | ||||
|     assert not doc2.has_unknown_spaces | ||||
|     assert doc2.text == "that's" | ||||
| 
 | ||||
|     doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) | ||||
|     re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) | ||||
|     assert re_doc1.has_unknown_spaces | ||||
|     assert re_doc1.text == "that 's " | ||||
|     assert not re_doc2.has_unknown_spaces | ||||
|     assert re_doc2.text == "that's" | ||||
|  |  | |||
|  | @ -1,14 +1,10 @@ | |||
| import pytest | ||||
| 
 | ||||
| from spacy.gold import docs_to_json | ||||
| from spacy.gold.converters import iob2docs, conll_ner2docs | ||||
| from spacy.gold.converters.conllu2json import conllu2json | ||||
| from spacy.gold import docs_to_json, biluo_tags_from_offsets | ||||
| from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs | ||||
| from spacy.lang.en import English | ||||
| from spacy.cli.pretrain import make_docs | ||||
| 
 | ||||
| # TODO | ||||
| # from spacy.gold.converters import conllu2docs | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_conllu2json(): | ||||
|     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu | ||||
|  | @ -19,8 +15,9 @@ def test_cli_converters_conllu2json(): | |||
|         "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", | ||||
|     ] | ||||
|     input_data = "\n".join(lines) | ||||
|     converted = conllu2json(input_data, n_sents=1) | ||||
|     assert len(converted) == 1 | ||||
|     converted_docs = conllu2docs(input_data, n_sents=1) | ||||
|     assert len(converted_docs) == 1 | ||||
|     converted = [docs_to_json(converted_docs)] | ||||
|     assert converted[0]["id"] == 0 | ||||
|     assert len(converted[0]["paragraphs"]) == 1 | ||||
|     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 | ||||
|  | @ -31,7 +28,9 @@ def test_cli_converters_conllu2json(): | |||
|     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] | ||||
|     assert [t["head"] for t in tokens] == [1, 2, -1, 0] | ||||
|     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] | ||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] | ||||
|     ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] | ||||
|     biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") | ||||
|     assert biluo_tags == ["O", "B-PER", "L-PER", "O"] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|  | @ -55,11 +54,12 @@ def test_cli_converters_conllu2json(): | |||
| ) | ||||
| def test_cli_converters_conllu2json_name_ner_map(lines): | ||||
|     input_data = "\n".join(lines) | ||||
|     converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) | ||||
|     assert len(converted) == 1 | ||||
|     converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) | ||||
|     assert len(converted_docs) == 1 | ||||
|     converted = [docs_to_json(converted_docs)] | ||||
|     assert converted[0]["id"] == 0 | ||||
|     assert len(converted[0]["paragraphs"]) == 1 | ||||
|     assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." | ||||
|     assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. " | ||||
|     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 | ||||
|     sent = converted[0]["paragraphs"][0]["sentences"][0] | ||||
|     assert len(sent["tokens"]) == 5 | ||||
|  | @ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines): | |||
|     assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] | ||||
|     assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] | ||||
|     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] | ||||
|     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] | ||||
|     ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] | ||||
|     biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") | ||||
|     assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_conllu2json_subtokens(): | ||||
|  | @ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens(): | |||
|         "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", | ||||
|     ] | ||||
|     input_data = "\n".join(lines) | ||||
|     converted = conllu2json( | ||||
|     converted_docs = conllu2docs( | ||||
|         input_data, n_sents=1, merge_subtokens=True, append_morphology=True | ||||
|     ) | ||||
|     assert len(converted) == 1 | ||||
|     assert len(converted_docs) == 1 | ||||
|     converted = [docs_to_json(converted_docs)] | ||||
| 
 | ||||
|     assert converted[0]["id"] == 0 | ||||
|     assert len(converted[0]["paragraphs"]) == 1 | ||||
|     assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." | ||||
|     assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. " | ||||
|     assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 | ||||
|     sent = converted[0]["paragraphs"][0]["sentences"][0] | ||||
|     assert len(sent["tokens"]) == 4 | ||||
|  | @ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens(): | |||
|     assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] | ||||
|     assert [t["head"] for t in tokens] == [1, 1, 0, -1] | ||||
|     assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] | ||||
|     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] | ||||
|     ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]] | ||||
|     biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O") | ||||
|     assert biluo_tags == ["O", "U-PER", "O", "O"] | ||||
| 
 | ||||
| 
 | ||||
| def test_cli_converters_iob2json(en_vocab): | ||||
|  |  | |||
|  | @ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots): | |||
|     [ | ||||
|         { | ||||
|             "words": ["I", "like", "New", "York", "and", "Berlin", "."], | ||||
|             "entities": [(7, 15, "LOC"), (20, 26, "LOC")], | ||||
|             "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}}, | ||||
|             "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}}, | ||||
|         } | ||||
|     ], | ||||
| ) | ||||
|  |  | |||
|  | @ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr | |||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") | ||||
| ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") | ||||
| 
 | ||||
| 
 | ||||
| class DocBin(object): | ||||
|  | @ -31,6 +31,7 @@ class DocBin(object): | |||
|         "spaces": bytes, # Serialized numpy boolean array with spaces data | ||||
|         "lengths": bytes, # Serialized numpy int32 array with the doc lengths | ||||
|         "strings": List[unicode] # List of unique strings in the token data | ||||
|         "version": str, # DocBin version number | ||||
|     } | ||||
| 
 | ||||
|     Strings for the words, tags, labels etc are represented by 64-bit hashes in | ||||
|  | @ -53,12 +54,14 @@ class DocBin(object): | |||
|         DOCS: https://spacy.io/api/docbin#init | ||||
|         """ | ||||
|         attrs = sorted([intify_attr(attr) for attr in attrs]) | ||||
|         self.version = "0.1" | ||||
|         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] | ||||
|         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0] | ||||
|         self.tokens = [] | ||||
|         self.spaces = [] | ||||
|         self.cats = [] | ||||
|         self.user_data = [] | ||||
|         self.flags = [] | ||||
|         self.strings = set() | ||||
|         self.store_user_data = store_user_data | ||||
|         for doc in docs: | ||||
|  | @ -83,12 +86,17 @@ class DocBin(object): | |||
|         assert array.shape[0] == spaces.shape[0]  # this should never happen | ||||
|         spaces = spaces.reshape((spaces.shape[0], 1)) | ||||
|         self.spaces.append(numpy.asarray(spaces, dtype=bool)) | ||||
|         self.flags.append({ | ||||
|             "has_unknown_spaces": doc.has_unknown_spaces | ||||
|         }) | ||||
|         for token in doc: | ||||
|             self.strings.add(token.text) | ||||
|             self.strings.add(token.tag_) | ||||
|             self.strings.add(token.lemma_) | ||||
|             self.strings.add(token.morph_) | ||||
|             self.strings.add(token.dep_) | ||||
|             self.strings.add(token.ent_type_) | ||||
|             self.strings.add(token.ent_kb_id_) | ||||
|         self.cats.append(doc.cats) | ||||
|         if self.store_user_data: | ||||
|             self.user_data.append(srsly.msgpack_dumps(doc.user_data)) | ||||
|  | @ -105,8 +113,11 @@ class DocBin(object): | |||
|             vocab[string] | ||||
|         orth_col = self.attrs.index(ORTH) | ||||
|         for i in range(len(self.tokens)): | ||||
|             flags = self.flags[i] | ||||
|             tokens = self.tokens[i] | ||||
|             spaces = self.spaces[i] | ||||
|             if flags.get("has_unknown_spaces"): | ||||
|                 spaces = None | ||||
|             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) | ||||
|             doc = doc.from_array(self.attrs, tokens) | ||||
|             doc.cats = self.cats[i] | ||||
|  | @ -130,6 +141,7 @@ class DocBin(object): | |||
|         self.spaces.extend(other.spaces) | ||||
|         self.strings.update(other.strings) | ||||
|         self.cats.extend(other.cats) | ||||
|         self.flags.extend(other.flags) | ||||
|         if self.store_user_data: | ||||
|             self.user_data.extend(other.user_data) | ||||
| 
 | ||||
|  | @ -147,12 +159,14 @@ class DocBin(object): | |||
|         spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) | ||||
| 
 | ||||
|         msg = { | ||||
|             "version": self.version, | ||||
|             "attrs": self.attrs, | ||||
|             "tokens": tokens.tobytes("C"), | ||||
|             "spaces": spaces.tobytes("C"), | ||||
|             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), | ||||
|             "strings": list(self.strings), | ||||
|             "cats": self.cats, | ||||
|             "flags": self.flags, | ||||
|         } | ||||
|         if self.store_user_data: | ||||
|             msg["user_data"] = self.user_data | ||||
|  | @ -178,6 +192,7 @@ class DocBin(object): | |||
|         self.tokens = NumpyOps().unflatten(flat_tokens, lengths) | ||||
|         self.spaces = NumpyOps().unflatten(flat_spaces, lengths) | ||||
|         self.cats = msg["cats"] | ||||
|         self.flags = msg.get("flags", [{} for _ in lengths]) | ||||
|         if self.store_user_data and "user_data" in msg: | ||||
|             self.user_data = list(msg["user_data"]) | ||||
|         for tokens in self.tokens: | ||||
|  |  | |||
|  | @ -59,11 +59,14 @@ cdef class Doc: | |||
|     cdef public dict user_token_hooks | ||||
|     cdef public dict user_span_hooks | ||||
| 
 | ||||
|     cdef public bint has_unknown_spaces | ||||
| 
 | ||||
|     cdef public list _py_tokens | ||||
| 
 | ||||
|     cdef int length | ||||
|     cdef int max_length | ||||
| 
 | ||||
| 
 | ||||
|     cdef public object noun_chunks_iterator | ||||
| 
 | ||||
|     cdef object __weakref__ | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ from libc.string cimport memcpy, memset | |||
| from libc.math cimport sqrt | ||||
| from libc.stdint cimport int32_t, uint64_t | ||||
| 
 | ||||
| import copy | ||||
| from collections import Counter | ||||
| import numpy | ||||
| import numpy.linalg | ||||
|  | @ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB | |||
| from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t | ||||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||
| 
 | ||||
| from ..attrs import intify_attrs, IDS | ||||
| from ..attrs import intify_attr, intify_attrs, IDS | ||||
| from ..util import normalize_slice | ||||
| from ..compat import copy_reg, pickle | ||||
| from ..errors import Errors, Warnings | ||||
|  | @ -171,8 +172,7 @@ cdef class Doc: | |||
|             raise ValueError(Errors.E046.format(name=name)) | ||||
|         return Underscore.doc_extensions.pop(name) | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, | ||||
|                  orths_and_spaces=None): | ||||
|     def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None): | ||||
|         """Create a Doc object. | ||||
| 
 | ||||
|         vocab (Vocab): A vocabulary object, which must match any models you | ||||
|  | @ -214,27 +214,24 @@ cdef class Doc: | |||
|         self._vector = None | ||||
|         self.noun_chunks_iterator = _get_chunker(self.vocab.lang) | ||||
|         cdef bint has_space | ||||
|         if orths_and_spaces is None and words is not None: | ||||
|             if spaces is None: | ||||
|                 spaces = [True] * len(words) | ||||
|             elif len(spaces) != len(words): | ||||
|                 raise ValueError(Errors.E027) | ||||
|             orths_and_spaces = zip(words, spaces) | ||||
|         cdef const LexemeC* lexeme | ||||
|         if orths_and_spaces is not None: | ||||
|             orths_and_spaces = list(orths_and_spaces) | ||||
|             for orth_space in orths_and_spaces: | ||||
|                 if isinstance(orth_space, unicode): | ||||
|                     lexeme = self.vocab.get(self.mem, orth_space) | ||||
|                     has_space = True | ||||
|                 elif isinstance(orth_space, bytes): | ||||
|                     raise ValueError(Errors.E028.format(value=orth_space)) | ||||
|                 elif isinstance(orth_space[0], unicode): | ||||
|                     lexeme = self.vocab.get(self.mem, orth_space[0]) | ||||
|                     has_space = orth_space[1] | ||||
|         if words is None and spaces is not None: | ||||
|             raise ValueError("words must be set if spaces is set") | ||||
|         elif spaces is None and words is not None: | ||||
|             self.has_unknown_spaces = True | ||||
|         else: | ||||
|                     lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) | ||||
|                     has_space = orth_space[1] | ||||
|             self.has_unknown_spaces = False | ||||
|         words = words if words is not None else [] | ||||
|         spaces = spaces if spaces is not None else ([True] * len(words)) | ||||
|         if len(spaces) != len(words): | ||||
|             raise ValueError(Errors.E027) | ||||
|         cdef const LexemeC* lexeme | ||||
|         for word, has_space in zip(words, spaces): | ||||
|             if isinstance(word, unicode): | ||||
|                 lexeme = self.vocab.get(self.mem, word) | ||||
|             elif isinstance(word, bytes): | ||||
|                 raise ValueError(Errors.E028.format(value=word)) | ||||
|             else: | ||||
|                 lexeme = self.vocab.get_by_orth(self.mem, word) | ||||
|             self.push_back(lexeme, has_space) | ||||
|         # Tough to decide on policy for this. Is an empty doc tagged and parsed? | ||||
|         # There's no information we'd like to add to it, so I guess so? | ||||
|  | @ -806,7 +803,7 @@ cdef class Doc: | |||
|         attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) | ||||
|                  for id_ in attrs] | ||||
|         if array.dtype != numpy.uint64: | ||||
|             warnings.warn(Warnings.W028.format(type=array.dtype)) | ||||
|             warnings.warn(Warnings.W101.format(type=array.dtype)) | ||||
| 
 | ||||
|         if SENT_START in attrs and HEAD in attrs: | ||||
|             raise ValueError(Errors.E032) | ||||
|  | @ -882,6 +879,87 @@ cdef class Doc: | |||
|             set_children_from_heads(self.c, length) | ||||
|         return self | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def from_docs(docs, ensure_whitespace=True, attrs=None): | ||||
|         """Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share | ||||
|         the same `Vocab`. | ||||
| 
 | ||||
|         docs (list): A list of Doc objects. | ||||
|         ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace. | ||||
|         attrs (list): Optional list of attribute ID ints or attribute name strings. | ||||
|         RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_docs | ||||
|         """ | ||||
|         if not docs: | ||||
|             return None | ||||
| 
 | ||||
|         vocab = {doc.vocab for doc in docs} | ||||
|         if len(vocab) > 1: | ||||
|             raise ValueError(Errors.E999) | ||||
|         (vocab,) = vocab | ||||
| 
 | ||||
|         if attrs is None: | ||||
|             attrs = [LEMMA, NORM] | ||||
|             if all(doc.is_nered for doc in docs): | ||||
|                 attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE]) | ||||
|             # TODO: separate for is_morphed? | ||||
|             if all(doc.is_tagged for doc in docs): | ||||
|                 attrs.extend([TAG, POS, MORPH]) | ||||
|             if all(doc.is_parsed for doc in docs): | ||||
|                 attrs.extend([HEAD, DEP]) | ||||
|             else: | ||||
|                 attrs.append(SENT_START) | ||||
|         else: | ||||
|             if any(isinstance(attr, str) for attr in attrs):     # resolve attribute names | ||||
|                 attrs = [intify_attr(attr) for attr in attrs]    # intify_attr returns None for invalid attrs | ||||
|             attrs = list(attr for attr in set(attrs) if attr)    # filter duplicates, remove None if present | ||||
|         if SPACY not in attrs: | ||||
|             attrs.append(SPACY) | ||||
| 
 | ||||
|         concat_words = [] | ||||
|         concat_spaces = [] | ||||
|         concat_user_data = {} | ||||
|         char_offset = 0 | ||||
|         for doc in docs: | ||||
|             concat_words.extend(t.text for t in doc) | ||||
|             concat_spaces.extend(bool(t.whitespace_) for t in doc) | ||||
| 
 | ||||
|             for key, value in doc.user_data.items(): | ||||
|                 if isinstance(key, tuple) and len(key) == 4: | ||||
|                     data_type, name, start, end = key | ||||
|                     if start is not None or end is not None: | ||||
|                         start += char_offset | ||||
|                         if end is not None: | ||||
|                             end += char_offset | ||||
|                         concat_user_data[(data_type, name, start, end)] = copy.copy(value) | ||||
|                     else: | ||||
|                         warnings.warn(Warnings.W101.format(name=name)) | ||||
|                 else: | ||||
|                     warnings.warn(Warnings.W102.format(key=key, value=value)) | ||||
|             char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1 | ||||
| 
 | ||||
|         arrays = [doc.to_array(attrs) for doc in docs] | ||||
| 
 | ||||
|         if ensure_whitespace: | ||||
|             spacy_index = attrs.index(SPACY) | ||||
|             for i, array in enumerate(arrays[:-1]): | ||||
|                 if len(array) > 0 and not docs[i][-1].is_space: | ||||
|                     array[-1][spacy_index] = 1 | ||||
|             token_offset = -1 | ||||
|             for doc in docs[:-1]: | ||||
|                 token_offset += len(doc) | ||||
|                 if not doc[-1].is_space: | ||||
|                     concat_spaces[token_offset] = True | ||||
| 
 | ||||
|         concat_array = numpy.concatenate(arrays) | ||||
| 
 | ||||
|         concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data) | ||||
| 
 | ||||
|         concat_doc.from_array(attrs, concat_array) | ||||
| 
 | ||||
|         return concat_doc | ||||
| 
 | ||||
|     def get_lca_matrix(self): | ||||
|         """Calculates a matrix of Lowest Common Ancestors (LCA) for a given | ||||
|         `Doc`, where LCA[i, j] is the index of the lowest common ancestor among | ||||
|  | @ -1000,6 +1078,7 @@ cdef class Doc: | |||
|             "sentiment": lambda: self.sentiment, | ||||
|             "tensor": lambda: self.tensor, | ||||
|             "cats": lambda: self.cats, | ||||
|             "has_unknown_spaces": lambda: self.has_unknown_spaces | ||||
|         } | ||||
|         for key in kwargs: | ||||
|             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): | ||||
|  | @ -1032,6 +1111,7 @@ cdef class Doc: | |||
|             "cats": lambda b: None, | ||||
|             "user_data_keys": lambda b: None, | ||||
|             "user_data_values": lambda b: None, | ||||
|             "has_unknown_spaces": lambda b: None | ||||
|         } | ||||
|         for key in kwargs: | ||||
|             if key in deserializers or key in ("user_data",): | ||||
|  | @ -1052,6 +1132,8 @@ cdef class Doc: | |||
|             self.tensor = msg["tensor"] | ||||
|         if "cats" not in exclude and "cats" in msg: | ||||
|             self.cats = msg["cats"] | ||||
|         if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg: | ||||
|             self.has_unknown_spaces = msg["has_unknown_spaces"] | ||||
|         start = 0 | ||||
|         cdef const LexemeC* lex | ||||
|         cdef unicode orth_ | ||||
|  | @ -1123,50 +1205,6 @@ cdef class Doc: | |||
|                 remove_label_if_necessary(attributes[i]) | ||||
|                 retokenizer.merge(span, attributes[i]) | ||||
| 
 | ||||
|     def merge(self, int start_idx, int end_idx, *args, **attributes): | ||||
|         """Retokenize the document, such that the span at | ||||
|         `doc.text[start_idx : end_idx]` is merged into a single token. If | ||||
|         `start_idx` and `end_idx `do not mark start and end token boundaries, | ||||
|         the document remains unchanged. | ||||
| 
 | ||||
|         start_idx (int): Character index of the start of the slice to merge. | ||||
|         end_idx (int): Character index after the end of the slice to merge. | ||||
|         **attributes: Attributes to assign to the merged token. By default, | ||||
|             attributes are inherited from the syntactic root of the span. | ||||
|         RETURNS (Token): The newly merged token, or `None` if the start and end | ||||
|             indices did not fall at token boundaries. | ||||
|         """ | ||||
|         cdef unicode tag, lemma, ent_type | ||||
|         warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning) | ||||
|         # TODO: ENT_KB_ID ? | ||||
|         if len(args) == 3: | ||||
|             warnings.warn(Warnings.W003, DeprecationWarning) | ||||
|             tag, lemma, ent_type = args | ||||
|             attributes[TAG] = tag | ||||
|             attributes[LEMMA] = lemma | ||||
|             attributes[ENT_TYPE] = ent_type | ||||
|         elif not args: | ||||
|             fix_attributes(self, attributes) | ||||
|         elif args: | ||||
|             raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args), | ||||
|                                                 kwargs=repr(attributes))) | ||||
|         remove_label_if_necessary(attributes) | ||||
|         attributes = intify_attrs(attributes, strings_map=self.vocab.strings) | ||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||
|         if start == -1: | ||||
|             return None | ||||
|         cdef int end = token_by_end(self.c, self.length, end_idx) | ||||
|         if end == -1: | ||||
|             return None | ||||
|         # Currently we have the token index, we want the range-end index | ||||
|         end += 1 | ||||
|         with self.retokenize() as retokenizer: | ||||
|             retokenizer.merge(self[start:end], attrs=attributes) | ||||
|         return self[start] | ||||
| 
 | ||||
|     def print_tree(self, light=False, flat=False): | ||||
|         raise ValueError(Errors.E105) | ||||
| 
 | ||||
|     def to_json(self, underscore=None): | ||||
|         """Convert a Doc to JSON. The format it produces will be the new format | ||||
|         for the `spacy train` command (not implemented yet). | ||||
|  |  | |||
|  | @ -280,18 +280,6 @@ cdef class Span: | |||
| 
 | ||||
|         return array | ||||
| 
 | ||||
|     def merge(self, *args, **attributes): | ||||
|         """Retokenize the document, such that the span is merged into a single | ||||
|         token. | ||||
| 
 | ||||
|         **attributes: Attributes to assign to the merged token. By default, | ||||
|             attributes are inherited from the syntactic root token of the span. | ||||
|         RETURNS (Token): The newly merged token. | ||||
|         """ | ||||
|         warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning) | ||||
|         return self.doc.merge(self.start_char, self.end_char, *args, | ||||
|                               **attributes) | ||||
| 
 | ||||
|     def get_lca_matrix(self): | ||||
|         """Calculates a matrix of Lowest Common Ancestors (LCA) for a given | ||||
|         `Span`, where LCA[i, j] is the index of the lowest common ancestor among | ||||
|  |  | |||
|  | @ -349,6 +349,33 @@ array of attributes. | |||
| | `exclude`   | list                                   | String names of [serialization fields](#serialization-fields) to exclude. | | ||||
| | **RETURNS** | `Doc`                                  | Itself.                                                                   | | ||||
| 
 | ||||
| 
 | ||||
| ## Doc.from_docs {#from_docs tag="staticmethod"} | ||||
| 
 | ||||
| Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > texts = ["London is the capital of the United Kingdom.", | ||||
| >          "The River Thames flows through London.", | ||||
| >          "The famous Tower Bridge crosses the River Thames."] | ||||
| > docs = list(nlp.pipe(texts)) | ||||
| > c_doc = Doc.from_docs(docs) | ||||
| > assert str(c_doc) == " ".join(texts) | ||||
| > assert len(list(c_doc.sents)) == len(docs) | ||||
| > assert [str(ent) for ent in c_doc.ents] == \ | ||||
| >        [str(ent) for doc in docs for ent in doc.ents] | ||||
| > ``` | ||||
| 
 | ||||
| | Name                | Type  | Description                                                                                     | | ||||
| | ------------------- | ----- | ----------------------------------------------------------------------------------------------- | | ||||
| | `docs`              | list  | A list of `Doc` objects.                                                                        | | ||||
| | `ensure_whitespace` | bool  | Insert a space between two adjacent docs whenever the first doc does not end in whitespace.     | | ||||
| | `attrs`             | list  | Optional list of attribute ID ints or attribute name strings.                                   | | ||||
| | **RETURNS**         | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`.  | | ||||
| 
 | ||||
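The interaction of `ensure_whitespace` and `attrs` can be illustrated with a minimal sketch. The blank English pipeline here is only an assumption for the example; any docs sharing a `Vocab` behave the same way.

```python
import spacy
from spacy.tokens import Doc

# A blank pipeline is enough for this sketch; no trained model is assumed.
nlp = spacy.blank("en")
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("They don't think alike.")

# With ensure_whitespace=True (the default), a space is inserted after doc1
# because its last token is not followed by whitespace.
merged = Doc.from_docs([doc1, doc2])
assert merged.text == doc1.text + " " + doc2.text

# With ensure_whitespace=False the texts are concatenated exactly as they are.
glued = Doc.from_docs([doc1, doc2], ensure_whitespace=False)
assert glued.text == doc1.text + doc2.text

# attrs limits which token attributes are copied over; SPACY (trailing
# whitespace) is always included, so the text itself stays intact.
subset = Doc.from_docs([doc1, doc2], attrs=["LEMMA", "POS"])
assert subset.text == merged.text
```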
| ## Doc.to_disk {#to_disk tag="method" new="2"} | ||||
| 
 | ||||
| Save the current state to a directory. | ||||
|  |  | |||
|  | @ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where | |||
| the msgpack object has the following structure: | ||||
| 
 | ||||
| ```python | ||||
| ### msgpack object strcutrue | ||||
| ### msgpack object structure | ||||
| { | ||||
|     "version": str,           # DocBin version number | ||||
|     "attrs": List[uint64],    # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] | ||||
|     "tokens": bytes,          # Serialized numpy uint64 array with the token data | ||||
|     "spaces": bytes,          # Serialized numpy boolean array with spaces data | ||||
|  | @ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations. | |||
| 
 | ||||
| | Argument          | Type     | Description                                                                                                                                                                                | | ||||
| | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `attrs`           | list     | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | ||||
| | `attrs`           | list     | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | | ||||
| | `store_user_data` | bool     | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`.                                                                                 | | ||||
| | **RETURNS**       | `DocBin` | The newly constructed object.                                                                                                                                                              | | ||||
| 
 | ||||
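As a rough sketch of the serialization round trip described above (the blank pipeline and the particular attribute subset are illustrative, not required):

```python
import spacy
from spacy.tokens import Doc, DocBin

nlp = spacy.blank("en")  # assumption: any pipeline whose Vocab is reused below

# Store only a subset of attributes and keep custom user data.
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
doc_bin.add(nlp("Give it to me."))
# A doc created without spaces round-trips via the "flags" data.
doc_bin.add(Doc(nlp.vocab, words=["that", "'s"]))

data = doc_bin.to_bytes()
# store_user_data must also be set on the reader to restore Doc.user_data.
reloaded = DocBin(store_user_data=True).from_bytes(data)
docs = list(reloaded.get_docs(nlp.vocab))
assert docs[0].text == "Give it to me."
assert docs[1].has_unknown_spaces
```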
|  |  | |||
|  | @ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names. | |||
| 
 | ||||
| ## Matcher.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
| Find all token sequences matching the supplied patterns on the `Doc`. As of | ||||
| spaCy v2.3, the `Matcher` can also be called on `Span` objects. | ||||
| Find all token sequences matching the supplied patterns on the `Doc` or `Span`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects. | |||
| > | ||||
| > matcher = Matcher(nlp.vocab) | ||||
| > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] | ||||
| > matcher.add("HelloWorld", None, pattern) | ||||
| > matcher.add("HelloWorld", [pattern]) | ||||
| > doc = nlp("hello world!") | ||||
| > matches = matcher(doc) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type         | Description                                                                                                                                                              | | ||||
| | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `doclike`   | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3).                                                                                                                     | | ||||
| | `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over.                                                                                                                                       | | ||||
| | **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | ||||
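| 
| For illustration, each match tuple can be resolved back to a string ID and a `Span` of the document. This is a minimal sketch that reuses `nlp`, `matcher` and `doc` from the example above: | ||||
| 
| ```python | ||||
| for match_id, start, end in matcher(doc): | ||||
|     string_id = nlp.vocab.strings[match_id]  # e.g. "HelloWorld" | ||||
|     span = doc[start:end]                    # the matched span | ||||
|     print(string_id, span.text) | ||||
| ``` | ||||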
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
| By default, the matcher **does not perform any action** on matches, like tagging | ||||
| matched phrases with entity types. Instead, actions need to be specified when | ||||
| **adding patterns or entities**, by passing in a callback function as the | ||||
| `on_match` argument on [`add`](/api/matcher#add). This allows you to define | ||||
| custom actions per pattern within the same matcher. For example, you might only | ||||
| want to merge some entity types, and set custom flags for other matched | ||||
| patterns. For more details and examples, see the usage guide on | ||||
| [rule-based matching](/usage/rule-based-matching). | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ## Matcher.pipe {#pipe tag="method"} | ||||
| 
 | ||||
| Match a stream of documents, yielding them in turn. | ||||
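| 
| As a minimal usage sketch (assuming `matcher` already has patterns added, and that `nlp` and the list of strings `texts` are placeholders): | ||||
| 
| ```python | ||||
| docs = nlp.pipe(texts) | ||||
| for doc in matcher.pipe(docs): | ||||
|     # Docs are yielded unchanged; match over each one as usual | ||||
|     matches = matcher(doc) | ||||
| ``` | ||||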
|  | @ -92,7 +78,7 @@ patterns. | |||
| > ```python | ||||
| > matcher = Matcher(nlp.vocab) | ||||
| > assert len(matcher) == 0 | ||||
| > matcher.add("Rule", None, [{"ORTH": "test"}]) | ||||
| > matcher.add("Rule", [[{"ORTH": "test"}]]) | ||||
| > assert len(matcher) == 1 | ||||
| > ``` | ||||
| 
 | ||||
|  | @ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID. | |||
| > | ||||
| > ```python | ||||
| > matcher = Matcher(nlp.vocab) | ||||
| > assert 'Rule' not in matcher | ||||
| > matcher.add('Rule', None, [{'ORTH': 'test'}]) | ||||
| > assert 'Rule' in matcher | ||||
| > assert "Rule" not in matcher | ||||
| > matcher.add("Rule", [[{'ORTH': 'test'}]]) | ||||
| > assert "Rule" in matcher | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type | Description                                           | | ||||
|  | @ -133,35 +119,35 @@ overwritten. | |||
| >     print('Matched!', matches) | ||||
| > | ||||
| > matcher = Matcher(nlp.vocab) | ||||
| >   matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) | ||||
| >   matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) | ||||
| > patterns = [ | ||||
| >    [{"LOWER": "hello"}, {"LOWER": "world"}], | ||||
| >    [{"ORTH": "Google"}, {"ORTH": "Maps"}] | ||||
| > ] | ||||
| > matcher.add("TEST_PATTERNS", patterns) | ||||
| > doc = nlp("HELLO WORLD on Google Maps.") | ||||
| > matches = matcher(doc) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type               | Description                                                                                   | | ||||
| | ----------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id`  | str                | An ID for the thing you're matching.                                                          | | ||||
| | `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| | `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      | | ||||
| <Infobox title="Changed in v3.0" variant="warning"> | ||||
| 
 | ||||
| <Infobox title="Changed in v2.2.2" variant="warning"> | ||||
| 
 | ||||
| As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become | ||||
| the default in the future. The patterns are now the second argument and a list | ||||
| As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument | ||||
| (instead of a variable number of arguments). The `on_match` callback becomes an | ||||
| optional keyword argument. | ||||
| 
 | ||||
| ```diff | ||||
| patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] | ||||
| - matcher.add("GoogleNow", None, *patterns) | ||||
| + matcher.add("GoogleNow", patterns) | ||||
| - matcher.add("GoogleNow", on_match, *patterns) | ||||
| + matcher.add("GoogleNow", patterns, on_match=on_match) | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| | Name       | Type               | Description                                                                                   | | ||||
| | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id` | str                | An ID for the thing you're matching.                                                          | | ||||
| | `patterns` | list               | The match patterns. Each pattern consists of a list of dicts, where each dict describes a token.    | ||||
| | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| 
 | ||||
| ## Matcher.remove {#remove tag="method" new="2"} | ||||
| 
 | ||||
| Remove a rule from the matcher. A `KeyError` is raised if the match ID does not | ||||
|  | @ -170,7 +156,7 @@ exist. | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher.add("Rule", None, [{"ORTH": "test"}]) | ||||
| > matcher.add("Rule", [[{"ORTH": "test"}]]) | ||||
| > assert "Rule" in matcher | ||||
| > matcher.remove("Rule") | ||||
| > assert "Rule" not in matcher | ||||
|  | @ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > matcher.add("Rule", None, [{"ORTH": "test"}]) | ||||
| > matcher.add("Rule", [[{"ORTH": "test"}]]) | ||||
| > on_match, patterns = matcher.get("Rule") | ||||
| > ``` | ||||
| 
 | ||||
|  |  | |||
|  | @ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. | |||
| > from spacy.matcher import PhraseMatcher | ||||
| > | ||||
| > matcher = PhraseMatcher(nlp.vocab) | ||||
| > matcher.add("OBAMA", None, nlp("Barack Obama")) | ||||
| > matcher.add("OBAMA", [nlp("Barack Obama")]) | ||||
| > doc = nlp("Barack Obama lifts America one last time in emotional farewell") | ||||
| > matches = matcher(doc) | ||||
| > ``` | ||||
|  | @ -104,7 +104,7 @@ patterns. | |||
| > ```python | ||||
| >   matcher = PhraseMatcher(nlp.vocab) | ||||
| >   assert len(matcher) == 0 | ||||
| >   matcher.add("OBAMA", None, nlp("Barack Obama")) | ||||
| >   matcher.add("OBAMA", [nlp("Barack Obama")]) | ||||
| >   assert len(matcher) == 1 | ||||
| > ``` | ||||
| 
 | ||||
|  | @ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID. | |||
| > ```python | ||||
| >   matcher = PhraseMatcher(nlp.vocab) | ||||
| >   assert "OBAMA" not in matcher | ||||
| >   matcher.add("OBAMA", None, nlp("Barack Obama")) | ||||
| >   matcher.add("OBAMA", [nlp("Barack Obama")]) | ||||
| >   assert "OBAMA" in matcher | ||||
| > ``` | ||||
| 
 | ||||
|  | @ -145,36 +145,32 @@ overwritten. | |||
| >       print('Matched!', matches) | ||||
| > | ||||
| >   matcher = PhraseMatcher(nlp.vocab) | ||||
| >   matcher.add("OBAMA", on_match, nlp("Barack Obama")) | ||||
| >   matcher.add("HEALTH", on_match, nlp("health care reform"), | ||||
| >                                   nlp("healthcare reform")) | ||||
| >   matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match) | ||||
| >   matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match) | ||||
| >   doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") | ||||
| >   matches = matcher(doc) | ||||
| > ``` | ||||
| 
 | ||||
| | Name       | Type               | Description                                                                                   | | ||||
| | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id` | str                | An ID for the thing you're matching.                                                          | | ||||
| | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| | `*docs`    | `Doc`              | `Doc` objects of the phrases to match.                                                        | | ||||
| <Infobox title="Changed in v3.0" variant="warning"> | ||||
| 
 | ||||
| <Infobox title="Changed in v2.2.2" variant="warning"> | ||||
| 
 | ||||
| As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will | ||||
| become the default in the future. The `Doc` patterns are now the second argument | ||||
| and a list (instead of a variable number of arguments). The `on_match` callback | ||||
| As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second | ||||
| argument (instead of a variable number of arguments). The `on_match` callback | ||||
| becomes an optional keyword argument. | ||||
| 
 | ||||
| ```diff | ||||
| patterns = [nlp("health care reform"), nlp("healthcare reform")] | ||||
| - matcher.add("HEALTH", None, *patterns) | ||||
| + matcher.add("HEALTH", patterns) | ||||
| - matcher.add("HEALTH", on_match, *patterns) | ||||
| + matcher.add("HEALTH", patterns, on_match=on_match) | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| | Name       | Type               | Description                                                                                   | | ||||
| | ---------- | ------------------ | --------------------------------------------------------------------------------------------- | | ||||
| | `match_id` | str                | An ID for the thing you're matching.                                                          | | ||||
| | `docs`     | list               | `Doc` objects of the phrases to match.                                                        | | ||||
| | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | ||||
| 
 | ||||
| ## PhraseMatcher.remove {#remove tag="method" new="2.2"} | ||||
| 
 | ||||
| Remove a rule from the matcher by match ID. A `KeyError` is raised if the key | ||||
|  | @ -184,7 +180,7 @@ does not exist. | |||
| > | ||||
| > ```python | ||||
| > matcher = PhraseMatcher(nlp.vocab) | ||||
| > matcher.add("OBAMA", None, nlp("Barack Obama")) | ||||
| > matcher.add("OBAMA", [nlp("Barack Obama")]) | ||||
| > assert "OBAMA" in matcher | ||||
| > matcher.remove("OBAMA") | ||||
| > assert "OBAMA" not in matcher | ||||
|  |  | |||
|  | @ -407,7 +407,7 @@ class EntityMatcher(object): | |||
|     def __init__(self, nlp, terms, label): | ||||
|         patterns = [nlp.make_doc(text) for text in terms] | ||||
|         self.matcher = PhraseMatcher(nlp.vocab) | ||||
|         self.matcher.add(label, None, *patterns) | ||||
|         self.matcher.add(label, patterns) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         matches = self.matcher(doc) | ||||
|  |  | |||
|  | @ -98,9 +98,7 @@ print([token.text for token in doc]) | |||
| 
 | ||||
| First, we initialize the `Matcher` with a vocab. The matcher must always share | ||||
| the same vocab with the documents it will operate on. We can now call | ||||
| [`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The | ||||
| second argument lets you pass in an optional callback function to invoke on a | ||||
| successful match. For now, we set it to `None`. | ||||
| [`matcher.add()`](/api/matcher#add) with an ID and a list of patterns. | ||||
| 
 | ||||
| ```python | ||||
| ### {executable="true"} | ||||
|  | @ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm") | |||
| matcher = Matcher(nlp.vocab) | ||||
| # Add match ID "HelloWorld" with no callback and one pattern | ||||
| pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] | ||||
| matcher.add("HelloWorld", None, pattern) | ||||
| matcher.add("HelloWorld", [pattern]) | ||||
| 
 | ||||
| doc = nlp("Hello, world! Hello world!") | ||||
| matches = matcher(doc) | ||||
|  | @ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to | |||
| also match sequences without punctuation between "hello" and "world": | ||||
| 
 | ||||
| ```python | ||||
| matcher.add("HelloWorld", None, | ||||
| patterns = [ | ||||
|     [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], | ||||
|             [{"LOWER": "hello"}, {"LOWER": "world"}]) | ||||
|     [{"LOWER": "hello"}, {"LOWER": "world"}] | ||||
| ] | ||||
| matcher.add("HelloWorld", patterns) | ||||
| ``` | ||||
| 
 | ||||
| By default, the matcher will only return the matches and **not do anything | ||||
|  | @ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm") | |||
| matcher = Matcher(nlp.vocab, validate=True) | ||||
| # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE | ||||
| pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] | ||||
| matcher.add("HelloWorld", None, pattern) | ||||
| matcher.add("HelloWorld", [pattern]) | ||||
| # 🚨 Raises an error: | ||||
| # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' | ||||
| # Pattern 0: | ||||
|  | @ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches): | |||
|     print(entity.text) | ||||
| 
 | ||||
| pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] | ||||
| matcher.add("GoogleIO", add_event_ent, pattern) | ||||
| matcher.add("GoogleIO", [pattern], on_match=add_event_ent) | ||||
| doc = nlp("This is a text about Google I/O") | ||||
| matches = matcher(doc) | ||||
| ``` | ||||
|  | @ -509,19 +509,18 @@ import spacy | |||
| from spacy.matcher import Matcher | ||||
| from spacy.tokens import Token | ||||
| 
 | ||||
| # We're using a class because the component needs to be initialised with | ||||
| # We're using a class because the component needs to be initialized with | ||||
| # the shared vocab via the nlp object | ||||
| class BadHTMLMerger(object): | ||||
|     def __init__(self, nlp): | ||||
|         patterns = [ | ||||
|             [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], | ||||
|             [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], | ||||
|         ] | ||||
|         # Register a new token extension to flag bad HTML | ||||
|         Token.set_extension("bad_html", default=False) | ||||
|         self.matcher = Matcher(nlp.vocab) | ||||
|         self.matcher.add( | ||||
|             "BAD_HTML", | ||||
|             None, | ||||
|             [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], | ||||
|             [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], | ||||
|         ) | ||||
|         self.matcher.add("BAD_HTML", patterns) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         # This method is invoked when the component is called on a Doc | ||||
|  | @ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches): | |||
| 
 | ||||
| pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, | ||||
|            {"POS": "ADJ"}] | ||||
| matcher.add("FacebookIs", collect_sents, pattern)  # add pattern | ||||
| matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern | ||||
| doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?") | ||||
| matches = matcher(doc) | ||||
| 
 | ||||
|  | @ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm") | |||
| matcher = Matcher(nlp.vocab) | ||||
| pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, | ||||
|            {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] | ||||
| matcher.add("PHONE_NUMBER", None, pattern) | ||||
| matcher.add("PHONE_NUMBER", [pattern]) | ||||
| 
 | ||||
| doc = nlp("Call me at (123) 456 789 or (123) 456 789!") | ||||
| print([t.text for t in doc]) | ||||
|  | @ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches): | |||
|     elif doc.vocab.strings[match_id] == "SAD": | ||||
|         doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment | ||||
| 
 | ||||
| matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern | ||||
| matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern | ||||
| matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # Add positive pattern | ||||
| matcher.add("SAD", neg_patterns, on_match=label_sentiment)  # Add negative pattern | ||||
| 
 | ||||
| # Add pattern for valid hashtag, i.e. '#' plus any ASCII token | ||||
| matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) | ||||
| matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]]) | ||||
| 
 | ||||
| doc = nlp("Hello world 😀 #MondayMotivation") | ||||
| matches = matcher(doc) | ||||
|  | @ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab) | |||
| terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] | ||||
| # Only run nlp.make_doc to speed things up | ||||
| patterns = [nlp.make_doc(text) for text in terms] | ||||
| matcher.add("TerminologyList", None, *patterns) | ||||
| matcher.add("TerminologyList", patterns) | ||||
| 
 | ||||
| doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " | ||||
|           "converse in the Oval Office inside the White House in Washington, D.C.") | ||||
|  | @ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher | |||
| nlp = English() | ||||
| matcher = PhraseMatcher(nlp.vocab, attr="LOWER") | ||||
| patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] | ||||
| matcher.add("Names", None, *patterns) | ||||
| matcher.add("Names", patterns) | ||||
| 
 | ||||
| doc = nlp("angela merkel and us president barack Obama") | ||||
| for match_id, start, end in matcher(doc): | ||||
|  | @ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher | |||
| 
 | ||||
| nlp = English() | ||||
| matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") | ||||
| matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) | ||||
| matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")]) | ||||
| 
 | ||||
| doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") | ||||
| for match_id, start, end in matcher(doc): | ||||
|  |  | |||
|  | @ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab) | |||
| def set_sentiment(matcher, doc, i, matches): | ||||
|     doc.sentiment += 0.1 | ||||
| 
 | ||||
| pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] | ||||
| pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] | ||||
| matcher.add("GoogleIO", None, pattern1)  # Match "Google I/O" or "Google i/o" | ||||
| matcher.add("HAPPY", set_sentiment, *pattern2)  # Match one or more happy emoji | ||||
| pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]] | ||||
| patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] | ||||
| matcher.add("GoogleIO", patterns1)  # Match "Google I/O" or "Google i/o" | ||||
| matcher.add("HAPPY", patterns2, on_match=set_sentiment)  # Match one or more happy emoji | ||||
| 
 | ||||
| doc = nlp("A text about Google I/O 😀😀") | ||||
| matches = matcher(doc) | ||||
|  |  | |||