mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
commit
949d4a0a0b
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a0"
|
||||
__version__ = "3.0.0a1"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -9,7 +9,7 @@ import sys
|
|||
from ._app import app, Arg, Opt
|
||||
from ..gold import docs_to_json
|
||||
from ..tokens import DocBin
|
||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
|
@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
|
|||
# imported from /converters.
|
||||
|
||||
CONVERTERS = {
|
||||
# "conllubio": conllu2docs, TODO
|
||||
# "conllu": conllu2docs, TODO
|
||||
# "conll": conllu2docs, TODO
|
||||
"conllubio": conllu2docs,
|
||||
"conllu": conllu2docs,
|
||||
"conll": conllu2docs,
|
||||
"ner": conll_ner2docs,
|
||||
"iob": iob2docs,
|
||||
"json": json2docs,
|
||||
|
@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
|
|||
if output_type == "json":
|
||||
srsly.write_json("-", docs_to_json(docs))
|
||||
else:
|
||||
sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
|
||||
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
|
||||
|
||||
|
||||
def _write_docs_to_file(docs, output_file, output_type):
|
||||
|
@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
|
|||
if output_type == "json":
|
||||
srsly.write_json(output_file, docs_to_json(docs))
|
||||
else:
|
||||
data = DocBin(docs=docs).to_bytes()
|
||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ def init_model_cli(
|
|||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
|
||||
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
|
||||
|
@ -56,6 +56,7 @@ def init_model_cli(
|
|||
freqs_loc=freqs_loc,
|
||||
clusters_loc=clusters_loc,
|
||||
jsonl_loc=jsonl_loc,
|
||||
vectors_loc=vectors_loc,
|
||||
prune_vectors=prune_vectors,
|
||||
truncate_vectors=truncate_vectors,
|
||||
vectors_name=vectors_name,
|
||||
|
@ -228,7 +229,7 @@ def add_vectors(
|
|||
else:
|
||||
if vectors_loc:
|
||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||
vectors_data, vector_keys = read_vectors(msg, vectors_loc)
|
||||
vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
|
||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
|
@ -247,7 +248,7 @@ def add_vectors(
|
|||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
||||
|
||||
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0):
|
||||
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
|
||||
f = open_file(vectors_loc)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
if truncate_vectors >= 1:
|
||||
|
|
|
@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model
|
|||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from .. import util
|
||||
from ..gold import Example
|
||||
|
||||
|
||||
@app.command("pretrain")
|
||||
|
@ -183,7 +182,7 @@ def pretrain(
|
|||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
[ex.doc for ex in batch],
|
||||
batch,
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
|
|
|
@ -159,6 +159,8 @@ class Warnings(object):
|
|||
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
|
||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
|
||||
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -556,8 +558,8 @@ class Errors(object):
|
|||
E979 = ("Cannot convert {type} to an Example object.")
|
||||
E980 = ("Each link annotation should refer to a dictionary with at most one "
|
||||
"identifier mapping to 1.0, and all others to 0.0.")
|
||||
E981 = ("The offsets of the annotations for 'links' need to refer exactly "
|
||||
"to the offsets of the 'entities' annotations.")
|
||||
E981 = ("The offsets of the annotations for 'links' could not be aligned "
|
||||
"to token boundaries.")
|
||||
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
||||
"into {values}, but found {value}.")
|
||||
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
|
||||
|
@ -593,6 +595,8 @@ class Errors(object):
|
|||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
"'{token_attrs}'.")
|
||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -1,6 +1,4 @@
|
|||
from .iob2docs import iob2docs # noqa: F401
|
||||
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
||||
from .json2docs import json2docs
|
||||
|
||||
# TODO: Update this one
|
||||
# from .conllu2docs import conllu2docs # noqa: F401
|
||||
from .conllu2docs import conllu2docs # noqa: F401
|
||||
|
|
|
@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
|
|||
from ...gold import Example
|
||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
|
||||
from ...language import Language
|
||||
from ...tokens import Doc, Token
|
||||
from ...tokens import Doc, Token, Span
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
def conllu2json(
|
||||
def conllu2docs(
|
||||
input_data,
|
||||
n_sents=10,
|
||||
append_morphology=False,
|
||||
|
@ -28,34 +28,22 @@ def conllu2json(
|
|||
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
|
||||
msg = Printer(no_print=no_print)
|
||||
n_sents_info(msg, n_sents)
|
||||
docs = []
|
||||
raw = ""
|
||||
sentences = []
|
||||
conll_data = read_conllx(
|
||||
sent_docs = read_conllx(
|
||||
input_data,
|
||||
append_morphology=append_morphology,
|
||||
ner_tag_pattern=MISC_NER_PATTERN,
|
||||
ner_map=ner_map,
|
||||
merge_subtokens=merge_subtokens,
|
||||
)
|
||||
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
|
||||
for i, example in enumerate(conll_data):
|
||||
raw += example.text
|
||||
sentences.append(
|
||||
generate_sentence(
|
||||
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map,
|
||||
)
|
||||
)
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_json_doc(raw, sentences, i)
|
||||
docs.append(doc)
|
||||
raw = ""
|
||||
sentences = []
|
||||
if sentences:
|
||||
doc = create_json_doc(raw, sentences, i)
|
||||
docs.append(doc)
|
||||
docs = []
|
||||
sent_docs_to_merge = []
|
||||
for sent_doc in sent_docs:
|
||||
sent_docs_to_merge.append(sent_doc)
|
||||
if len(sent_docs_to_merge) % n_sents == 0:
|
||||
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||
sent_docs_to_merge = []
|
||||
if sent_docs_to_merge:
|
||||
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||
return docs
|
||||
|
||||
|
||||
|
@ -84,14 +72,14 @@ def read_conllx(
|
|||
ner_tag_pattern="",
|
||||
ner_map=None,
|
||||
):
|
||||
""" Yield examples, one for each sentence """
|
||||
""" Yield docs, one for each sentence """
|
||||
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
example = example_from_conllu_sentence(
|
||||
doc = doc_from_conllu_sentence(
|
||||
vocab,
|
||||
lines,
|
||||
ner_tag_pattern,
|
||||
|
@ -99,7 +87,7 @@ def read_conllx(
|
|||
append_morphology=append_morphology,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
yield example
|
||||
yield doc
|
||||
|
||||
|
||||
def get_entities(lines, tag_pattern, ner_map=None):
|
||||
|
@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
|
|||
return iob_to_biluo(iob)
|
||||
|
||||
|
||||
def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
|
||||
sentence = {}
|
||||
tokens = []
|
||||
token_annotation = example_dict["token_annotation"]
|
||||
for i, id_ in enumerate(token_annotation["ids"]):
|
||||
token = {}
|
||||
token["id"] = id_
|
||||
token["orth"] = token_annotation["words"][i]
|
||||
token["tag"] = token_annotation["tags"][i]
|
||||
token["pos"] = token_annotation["pos"][i]
|
||||
token["lemma"] = token_annotation["lemmas"][i]
|
||||
token["morph"] = token_annotation["morphs"][i]
|
||||
token["head"] = token_annotation["heads"][i] - i
|
||||
token["dep"] = token_annotation["deps"][i]
|
||||
if has_ner_tags:
|
||||
token["ner"] = example_dict["doc_annotation"]["entities"][i]
|
||||
tokens.append(token)
|
||||
sentence["tokens"] = tokens
|
||||
return sentence
|
||||
|
||||
|
||||
def create_json_doc(raw, sentences, id_):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id_
|
||||
doc["paragraphs"] = []
|
||||
paragraph["raw"] = raw.strip()
|
||||
paragraph["sentences"] = sentences
|
||||
doc["paragraphs"].append(paragraph)
|
||||
return doc
|
||||
|
||||
|
||||
def example_from_conllu_sentence(
|
||||
def doc_from_conllu_sentence(
|
||||
vocab,
|
||||
lines,
|
||||
ner_tag_pattern,
|
||||
|
@ -263,8 +219,9 @@ def example_from_conllu_sentence(
|
|||
if merge_subtokens:
|
||||
doc = merge_conllu_subtokens(lines, doc)
|
||||
|
||||
# create Example from custom Doc annotation
|
||||
words, spaces, tags, morphs, lemmas = [], [], [], [], []
|
||||
# create final Doc from custom Doc annotation
|
||||
words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
|
||||
heads, deps = [], []
|
||||
for i, t in enumerate(doc):
|
||||
words.append(t._.merged_orth)
|
||||
lemmas.append(t._.merged_lemma)
|
||||
|
@ -274,16 +231,23 @@ def example_from_conllu_sentence(
|
|||
tags.append(t.tag_ + "__" + t._.merged_morph)
|
||||
else:
|
||||
tags.append(t.tag_)
|
||||
poses.append(t.pos_)
|
||||
heads.append(t.head.i)
|
||||
deps.append(t.dep_)
|
||||
|
||||
doc_x = Doc(vocab, words=words, spaces=spaces)
|
||||
ref_dict = Example(doc_x, reference=doc).to_dict()
|
||||
ref_dict["words"] = words
|
||||
ref_dict["lemmas"] = lemmas
|
||||
ref_dict["spaces"] = spaces
|
||||
ref_dict["tags"] = tags
|
||||
ref_dict["morphs"] = morphs
|
||||
example = Example.from_dict(doc_x, ref_dict)
|
||||
return example
|
||||
for i in range(len(doc)):
|
||||
doc_x[i].tag_ = tags[i]
|
||||
doc_x[i].morph_ = morphs[i]
|
||||
doc_x[i].lemma_ = lemmas[i]
|
||||
doc_x[i].pos_ = poses[i]
|
||||
doc_x[i].dep_ = deps[i]
|
||||
doc_x[i].head = doc_x[heads[i]]
|
||||
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||
doc_x.is_parsed = True
|
||||
doc_x.is_tagged = True
|
||||
|
||||
return doc_x
|
||||
|
||||
|
||||
def merge_conllu_subtokens(lines, doc):
|
|
@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
|
|||
for json_para in json_to_annotations(json_doc):
|
||||
example_dict = _fix_legacy_dict_data(json_para)
|
||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||
if json_para.get("raw"):
|
||||
assert tok_dict.get("SPACY")
|
||||
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
|
|
@ -43,24 +43,35 @@ class Corpus:
|
|||
locs.append(path)
|
||||
return locs
|
||||
|
||||
def _make_example(self, nlp, reference, gold_preproc):
|
||||
if gold_preproc or reference.has_unknown_spaces:
|
||||
return Example(
|
||||
Doc(
|
||||
nlp.vocab,
|
||||
words=[word.text for word in reference],
|
||||
spaces=[bool(word.whitespace_) for word in reference]
|
||||
),
|
||||
reference
|
||||
)
|
||||
else:
|
||||
return Example(
|
||||
nlp.make_doc(reference.text),
|
||||
reference
|
||||
)
|
||||
|
||||
def make_examples(self, nlp, reference_docs, max_length=0):
|
||||
for reference in reference_docs:
|
||||
if len(reference) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(reference) < max_length:
|
||||
yield Example(
|
||||
nlp.make_doc(reference.text),
|
||||
reference
|
||||
)
|
||||
yield self._make_example(nlp, reference, False)
|
||||
elif reference.is_sentenced:
|
||||
for ref_sent in reference.sents:
|
||||
if len(ref_sent) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(ref_sent) < max_length:
|
||||
yield Example(
|
||||
nlp.make_doc(ref_sent.text),
|
||||
ref_sent.as_doc()
|
||||
)
|
||||
yield self._make_example(nlp, ref_sent.as_doc(), False)
|
||||
|
||||
|
||||
def make_examples_gold_preproc(self, nlp, reference_docs):
|
||||
for reference in reference_docs:
|
||||
|
@ -69,14 +80,7 @@ class Corpus:
|
|||
else:
|
||||
ref_sents = [reference]
|
||||
for ref_sent in ref_sents:
|
||||
eg = Example(
|
||||
Doc(
|
||||
nlp.vocab,
|
||||
words=[w.text for w in ref_sent],
|
||||
spaces=[bool(w.whitespace_) for w in ref_sent]
|
||||
),
|
||||
ref_sent
|
||||
)
|
||||
eg = self._make_example(nlp, ref_sent, True)
|
||||
if len(eg.x):
|
||||
yield eg
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ from ..syntax import nonproj
|
|||
|
||||
|
||||
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||
""" Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
|
||||
""" Create a Doc from dictionaries with token and doc annotations. """
|
||||
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||
if "entities" in doc_annot:
|
||||
|
@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
|||
if key == "entities":
|
||||
pass
|
||||
elif key == "links":
|
||||
entities = doc_annot.get("entities", {})
|
||||
if not entities:
|
||||
raise ValueError(Errors.E981)
|
||||
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
|
||||
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
|
||||
tok_annot["ENT_KB_ID"] = ent_kb_ids
|
||||
elif key == "cats":
|
||||
pass
|
||||
|
@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
ent_types.append("")
|
||||
return ent_iobs, ent_types
|
||||
|
||||
def _parse_links(vocab, words, links, entities):
|
||||
reference = Doc(vocab, words=words)
|
||||
def _parse_links(vocab, words, spaces, links):
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
starts = {token.idx: token.i for token in reference}
|
||||
ends = {token.idx + len(token): token.i for token in reference}
|
||||
ent_kb_ids = ["" for _ in reference]
|
||||
entity_map = [(ent[0], ent[1]) for ent in entities]
|
||||
|
||||
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
|
||||
for index, annot_dict in links.items():
|
||||
start_char, end_char = index
|
||||
if (start_char, end_char) not in entity_map:
|
||||
raise ValueError(Errors.E981)
|
||||
|
||||
for index, annot_dict in links.items():
|
||||
true_kb_ids = []
|
||||
|
@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities):
|
|||
start_char, end_char = index
|
||||
start_token = starts.get(start_char)
|
||||
end_token = ends.get(end_char)
|
||||
if start_token is None or end_token is None:
|
||||
raise ValueError(Errors.E981)
|
||||
for i in range(start_token, end_token+1):
|
||||
ent_kb_ids[i] = true_kb_ids[0]
|
||||
|
||||
|
@ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities):
|
|||
|
||||
def _guess_spaces(text, words):
|
||||
if text is None:
|
||||
return [True] * len(words)
|
||||
return None
|
||||
spaces = []
|
||||
text_pos = 0
|
||||
# align words with text
|
||||
|
|
|
@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
|
|||
assert new_doc.is_parsed
|
||||
|
||||
|
||||
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
|
||||
en_texts = ["Merging the docs is fun.", "They don't think alike."]
|
||||
de_text = "Wie war die Frage?"
|
||||
en_docs = [en_tokenizer(text) for text in en_texts]
|
||||
docs_idx = en_texts[0].index('docs')
|
||||
de_doc = de_tokenizer(de_text)
|
||||
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
|
||||
|
||||
assert Doc.from_docs([]) is None
|
||||
|
||||
assert de_doc is not Doc.from_docs([de_doc])
|
||||
assert str(de_doc) == str(Doc.from_docs([de_doc]))
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
Doc.from_docs(en_docs + [de_doc])
|
||||
|
||||
m_doc = Doc.from_docs(en_docs)
|
||||
assert len(en_docs) == len(list(m_doc.sents))
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
assert str(m_doc) == " ".join(en_texts)
|
||||
p_token = m_doc[len(en_docs[0])-1]
|
||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
|
||||
assert m_doc[9].idx == think_idx
|
||||
with pytest.raises(AttributeError):
|
||||
not_available = m_doc[2]._.is_ambiguous # not callable, because it was not set via set_extension
|
||||
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
|
||||
assert len(en_docs) == len(list(m_doc.sents))
|
||||
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
|
||||
assert str(m_doc) == "".join(en_texts)
|
||||
p_token = m_doc[len(en_docs[0]) - 1]
|
||||
assert p_token.text == "." and not bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
|
||||
assert m_doc[9].idx == think_idx
|
||||
|
||||
m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
|
||||
with pytest.raises(ValueError): # important attributes from sentenziser or parser are missing
|
||||
assert list(m_doc.sents)
|
||||
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
|
||||
assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing
|
||||
p_token = m_doc[len(en_docs[0]) - 1]
|
||||
assert p_token.text == "." and bool(p_token.whitespace_)
|
||||
en_docs_tokens = [t for doc in en_docs for t in doc]
|
||||
assert len(m_doc) == len(en_docs_tokens)
|
||||
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
|
||||
assert m_doc[9].idx == think_idx
|
||||
|
||||
|
||||
def test_doc_lang(en_vocab):
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
assert doc.lang_ == "en"
|
||||
|
|
|
@ -75,3 +75,19 @@ def test_serialize_doc_bin():
|
|||
for i, doc in enumerate(reloaded_docs):
|
||||
assert doc.text == texts[i]
|
||||
assert doc.cats == cats
|
||||
|
||||
|
||||
def test_serialize_doc_bin_unknown_spaces(en_vocab):
|
||||
doc1 = Doc(en_vocab, words=["that", "'s"])
|
||||
assert doc1.has_unknown_spaces
|
||||
assert doc1.text == "that 's "
|
||||
doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
|
||||
assert not doc2.has_unknown_spaces
|
||||
assert doc2.text == "that's"
|
||||
|
||||
doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
|
||||
re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
|
||||
assert re_doc1.has_unknown_spaces
|
||||
assert re_doc1.text == "that 's "
|
||||
assert not re_doc2.has_unknown_spaces
|
||||
assert re_doc2.text == "that's"
|
||||
|
|
|
@ -1,14 +1,10 @@
|
|||
import pytest
|
||||
|
||||
from spacy.gold import docs_to_json
|
||||
from spacy.gold.converters import iob2docs, conll_ner2docs
|
||||
from spacy.gold.converters.conllu2json import conllu2json
|
||||
from spacy.gold import docs_to_json, biluo_tags_from_offsets
|
||||
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.cli.pretrain import make_docs
|
||||
|
||||
# TODO
|
||||
# from spacy.gold.converters import conllu2docs
|
||||
|
||||
|
||||
def test_cli_converters_conllu2json():
|
||||
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
|
||||
|
@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
|
|||
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
|
||||
]
|
||||
input_data = "\n".join(lines)
|
||||
converted = conllu2json(input_data, n_sents=1)
|
||||
assert len(converted) == 1
|
||||
converted_docs = conllu2docs(input_data, n_sents=1)
|
||||
assert len(converted_docs) == 1
|
||||
converted = [docs_to_json(converted_docs)]
|
||||
assert converted[0]["id"] == 0
|
||||
assert len(converted[0]["paragraphs"]) == 1
|
||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||
|
@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
|
|||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
|
||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0]
|
||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
|
||||
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
|
||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
|
|||
)
|
||||
def test_cli_converters_conllu2json_name_ner_map(lines):
|
||||
input_data = "\n".join(lines)
|
||||
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
||||
assert len(converted) == 1
|
||||
converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
|
||||
assert len(converted_docs) == 1
|
||||
converted = [docs_to_json(converted_docs)]
|
||||
assert converted[0]["id"] == 0
|
||||
assert len(converted[0]["paragraphs"]) == 1
|
||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår."
|
||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
|
||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
||||
assert len(sent["tokens"]) == 5
|
||||
|
@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
|
|||
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
|
||||
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
|
||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
|
||||
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
|
||||
|
||||
|
||||
def test_cli_converters_conllu2json_subtokens():
|
||||
|
@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
|
|||
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
|
||||
]
|
||||
input_data = "\n".join(lines)
|
||||
converted = conllu2json(
|
||||
converted_docs = conllu2docs(
|
||||
input_data, n_sents=1, merge_subtokens=True, append_morphology=True
|
||||
)
|
||||
assert len(converted) == 1
|
||||
assert len(converted_docs) == 1
|
||||
converted = [docs_to_json(converted_docs)]
|
||||
|
||||
assert converted[0]["id"] == 0
|
||||
assert len(converted[0]["paragraphs"]) == 1
|
||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår."
|
||||
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
|
||||
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
|
||||
sent = converted[0]["paragraphs"][0]["sentences"][0]
|
||||
assert len(sent["tokens"]) == 4
|
||||
|
@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
|
|||
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
|
||||
assert [t["head"] for t in tokens] == [1, 1, 0, -1]
|
||||
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
|
||||
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
|
||||
ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
|
||||
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
|
||||
assert biluo_tags == ["O", "U-PER", "O", "O"]
|
||||
|
||||
|
||||
def test_cli_converters_iob2json(en_vocab):
|
||||
|
|
|
@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
|
|||
[
|
||||
{
|
||||
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
|
||||
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
|
||||
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
||||
"links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
|
||||
}
|
||||
],
|
||||
)
|
||||
|
|
|
@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
|
|||
from ..errors import Errors
|
||||
|
||||
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
|
||||
|
||||
class DocBin(object):
|
||||
|
@ -31,6 +31,7 @@ class DocBin(object):
|
|||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
|
||||
"strings": List[unicode] # List of unique strings in the token data
|
||||
"version": str, # DocBin version number
|
||||
}
|
||||
|
||||
Strings for the words, tags, labels etc are represented by 64-bit hashes in
|
||||
|
@ -53,12 +54,14 @@ class DocBin(object):
|
|||
DOCS: https://spacy.io/api/docbin#init
|
||||
"""
|
||||
attrs = sorted([intify_attr(attr) for attr in attrs])
|
||||
self.version = "0.1"
|
||||
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
|
||||
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
|
||||
self.tokens = []
|
||||
self.spaces = []
|
||||
self.cats = []
|
||||
self.user_data = []
|
||||
self.flags = []
|
||||
self.strings = set()
|
||||
self.store_user_data = store_user_data
|
||||
for doc in docs:
|
||||
|
@ -83,12 +86,17 @@ class DocBin(object):
|
|||
assert array.shape[0] == spaces.shape[0] # this should never happen
|
||||
spaces = spaces.reshape((spaces.shape[0], 1))
|
||||
self.spaces.append(numpy.asarray(spaces, dtype=bool))
|
||||
self.flags.append({
|
||||
"has_unknown_spaces": doc.has_unknown_spaces
|
||||
})
|
||||
for token in doc:
|
||||
self.strings.add(token.text)
|
||||
self.strings.add(token.tag_)
|
||||
self.strings.add(token.lemma_)
|
||||
self.strings.add(token.morph_)
|
||||
self.strings.add(token.dep_)
|
||||
self.strings.add(token.ent_type_)
|
||||
self.strings.add(token.ent_kb_id_)
|
||||
self.cats.append(doc.cats)
|
||||
if self.store_user_data:
|
||||
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
|
||||
|
@ -105,8 +113,11 @@ class DocBin(object):
|
|||
vocab[string]
|
||||
orth_col = self.attrs.index(ORTH)
|
||||
for i in range(len(self.tokens)):
|
||||
flags = self.flags[i]
|
||||
tokens = self.tokens[i]
|
||||
spaces = self.spaces[i]
|
||||
if flags.get("has_unknown_spaces"):
|
||||
spaces = None
|
||||
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
|
||||
doc = doc.from_array(self.attrs, tokens)
|
||||
doc.cats = self.cats[i]
|
||||
|
@ -130,6 +141,7 @@ class DocBin(object):
|
|||
self.spaces.extend(other.spaces)
|
||||
self.strings.update(other.strings)
|
||||
self.cats.extend(other.cats)
|
||||
self.flags.extend(other.flags)
|
||||
if self.store_user_data:
|
||||
self.user_data.extend(other.user_data)
|
||||
|
||||
|
@ -147,12 +159,14 @@ class DocBin(object):
|
|||
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
|
||||
|
||||
msg = {
|
||||
"version": self.version,
|
||||
"attrs": self.attrs,
|
||||
"tokens": tokens.tobytes("C"),
|
||||
"spaces": spaces.tobytes("C"),
|
||||
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
|
||||
"strings": list(self.strings),
|
||||
"cats": self.cats,
|
||||
"flags": self.flags,
|
||||
}
|
||||
if self.store_user_data:
|
||||
msg["user_data"] = self.user_data
|
||||
|
@ -178,6 +192,7 @@ class DocBin(object):
|
|||
self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
|
||||
self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
|
||||
self.cats = msg["cats"]
|
||||
self.flags = msg.get("flags", [{} for _ in lengths])
|
||||
if self.store_user_data and "user_data" in msg:
|
||||
self.user_data = list(msg["user_data"])
|
||||
for tokens in self.tokens:
|
||||
|
|
|
@ -59,11 +59,14 @@ cdef class Doc:
|
|||
cdef public dict user_token_hooks
|
||||
cdef public dict user_span_hooks
|
||||
|
||||
cdef public bint has_unknown_spaces
|
||||
|
||||
cdef public list _py_tokens
|
||||
|
||||
cdef int length
|
||||
cdef int max_length
|
||||
|
||||
|
||||
cdef public object noun_chunks_iterator
|
||||
|
||||
cdef object __weakref__
|
||||
|
|
|
@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
|
|||
from libc.math cimport sqrt
|
||||
from libc.stdint cimport int32_t, uint64_t
|
||||
|
||||
import copy
|
||||
from collections import Counter
|
||||
import numpy
|
||||
import numpy.linalg
|
||||
|
@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
|
|||
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
|
||||
from ..attrs import intify_attrs, IDS
|
||||
from ..attrs import intify_attr, intify_attrs, IDS
|
||||
from ..util import normalize_slice
|
||||
from ..compat import copy_reg, pickle
|
||||
from ..errors import Errors, Warnings
|
||||
|
@ -171,8 +172,7 @@ cdef class Doc:
|
|||
raise ValueError(Errors.E046.format(name=name))
|
||||
return Underscore.doc_extensions.pop(name)
|
||||
|
||||
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
|
||||
orths_and_spaces=None):
|
||||
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
|
||||
"""Create a Doc object.
|
||||
|
||||
vocab (Vocab): A vocabulary object, which must match any models you
|
||||
|
@ -214,27 +214,24 @@ cdef class Doc:
|
|||
self._vector = None
|
||||
self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
|
||||
cdef bint has_space
|
||||
if orths_and_spaces is None and words is not None:
|
||||
if spaces is None:
|
||||
spaces = [True] * len(words)
|
||||
elif len(spaces) != len(words):
|
||||
raise ValueError(Errors.E027)
|
||||
orths_and_spaces = zip(words, spaces)
|
||||
cdef const LexemeC* lexeme
|
||||
if orths_and_spaces is not None:
|
||||
orths_and_spaces = list(orths_and_spaces)
|
||||
for orth_space in orths_and_spaces:
|
||||
if isinstance(orth_space, unicode):
|
||||
lexeme = self.vocab.get(self.mem, orth_space)
|
||||
has_space = True
|
||||
elif isinstance(orth_space, bytes):
|
||||
raise ValueError(Errors.E028.format(value=orth_space))
|
||||
elif isinstance(orth_space[0], unicode):
|
||||
lexeme = self.vocab.get(self.mem, orth_space[0])
|
||||
has_space = orth_space[1]
|
||||
if words is None and spaces is not None:
|
||||
raise ValueError("words must be set if spaces is set")
|
||||
elif spaces is None and words is not None:
|
||||
self.has_unknown_spaces = True
|
||||
else:
|
||||
lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
|
||||
has_space = orth_space[1]
|
||||
self.has_unknown_spaces = False
|
||||
words = words if words is not None else []
|
||||
spaces = spaces if spaces is not None else ([True] * len(words))
|
||||
if len(spaces) != len(words):
|
||||
raise ValueError(Errors.E027)
|
||||
cdef const LexemeC* lexeme
|
||||
for word, has_space in zip(words, spaces):
|
||||
if isinstance(word, unicode):
|
||||
lexeme = self.vocab.get(self.mem, word)
|
||||
elif isinstance(word, bytes):
|
||||
raise ValueError(Errors.E028.format(value=word))
|
||||
else:
|
||||
lexeme = self.vocab.get_by_orth(self.mem, word)
|
||||
self.push_back(lexeme, has_space)
|
||||
# Tough to decide on policy for this. Is an empty doc tagged and parsed?
|
||||
# There's no information we'd like to add to it, so I guess so?
|
||||
|
@ -806,7 +803,7 @@ cdef class Doc:
|
|||
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||
for id_ in attrs]
|
||||
if array.dtype != numpy.uint64:
|
||||
warnings.warn(Warnings.W028.format(type=array.dtype))
|
||||
warnings.warn(Warnings.W101.format(type=array.dtype))
|
||||
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(Errors.E032)
|
||||
|
@ -882,6 +879,87 @@ cdef class Doc:
|
|||
set_children_from_heads(self.c, length)
|
||||
return self
|
||||
|
||||
@staticmethod
|
||||
def from_docs(docs, ensure_whitespace=True, attrs=None):
|
||||
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
|
||||
the same `Vocab`.
|
||||
|
||||
docs (list): A list of Doc objects.
|
||||
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
|
||||
attrs (list): Optional list of attribute ID ints or attribute name strings.
|
||||
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#from_docs
|
||||
"""
|
||||
if not docs:
|
||||
return None
|
||||
|
||||
vocab = {doc.vocab for doc in docs}
|
||||
if len(vocab) > 1:
|
||||
raise ValueError(Errors.E999)
|
||||
(vocab,) = vocab
|
||||
|
||||
if attrs is None:
|
||||
attrs = [LEMMA, NORM]
|
||||
if all(doc.is_nered for doc in docs):
|
||||
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
|
||||
# TODO: separate for is_morphed?
|
||||
if all(doc.is_tagged for doc in docs):
|
||||
attrs.extend([TAG, POS, MORPH])
|
||||
if all(doc.is_parsed for doc in docs):
|
||||
attrs.extend([HEAD, DEP])
|
||||
else:
|
||||
attrs.append(SENT_START)
|
||||
else:
|
||||
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
|
||||
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
|
||||
attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present
|
||||
if SPACY not in attrs:
|
||||
attrs.append(SPACY)
|
||||
|
||||
concat_words = []
|
||||
concat_spaces = []
|
||||
concat_user_data = {}
|
||||
char_offset = 0
|
||||
for doc in docs:
|
||||
concat_words.extend(t.text for t in doc)
|
||||
concat_spaces.extend(bool(t.whitespace_) for t in doc)
|
||||
|
||||
for key, value in doc.user_data.items():
|
||||
if isinstance(key, tuple) and len(key) == 4:
|
||||
data_type, name, start, end = key
|
||||
if start is not None or end is not None:
|
||||
start += char_offset
|
||||
if end is not None:
|
||||
end += char_offset
|
||||
concat_user_data[(data_type, name, start, end)] = copy.copy(value)
|
||||
else:
|
||||
warnings.warn(Warnings.W101.format(name=name))
|
||||
else:
|
||||
warnings.warn(Warnings.W102.format(key=key, value=value))
|
||||
char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
|
||||
|
||||
arrays = [doc.to_array(attrs) for doc in docs]
|
||||
|
||||
if ensure_whitespace:
|
||||
spacy_index = attrs.index(SPACY)
|
||||
for i, array in enumerate(arrays[:-1]):
|
||||
if len(array) > 0 and not docs[i][-1].is_space:
|
||||
array[-1][spacy_index] = 1
|
||||
token_offset = -1
|
||||
for doc in docs[:-1]:
|
||||
token_offset += len(doc)
|
||||
if not doc[-1].is_space:
|
||||
concat_spaces[token_offset] = True
|
||||
|
||||
concat_array = numpy.concatenate(arrays)
|
||||
|
||||
concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
|
||||
|
||||
concat_doc.from_array(attrs, concat_array)
|
||||
|
||||
return concat_doc
|
||||
|
||||
def get_lca_matrix(self):
|
||||
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given
|
||||
`Doc`, where LCA[i, j] is the index of the lowest common ancestor among
|
||||
|
@ -1000,6 +1078,7 @@ cdef class Doc:
|
|||
"sentiment": lambda: self.sentiment,
|
||||
"tensor": lambda: self.tensor,
|
||||
"cats": lambda: self.cats,
|
||||
"has_unknown_spaces": lambda: self.has_unknown_spaces
|
||||
}
|
||||
for key in kwargs:
|
||||
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
|
||||
|
@ -1032,6 +1111,7 @@ cdef class Doc:
|
|||
"cats": lambda b: None,
|
||||
"user_data_keys": lambda b: None,
|
||||
"user_data_values": lambda b: None,
|
||||
"has_unknown_spaces": lambda b: None
|
||||
}
|
||||
for key in kwargs:
|
||||
if key in deserializers or key in ("user_data",):
|
||||
|
@ -1052,6 +1132,8 @@ cdef class Doc:
|
|||
self.tensor = msg["tensor"]
|
||||
if "cats" not in exclude and "cats" in msg:
|
||||
self.cats = msg["cats"]
|
||||
if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
|
||||
self.has_unknown_spaces = msg["has_unknown_spaces"]
|
||||
start = 0
|
||||
cdef const LexemeC* lex
|
||||
cdef unicode orth_
|
||||
|
@ -1123,50 +1205,6 @@ cdef class Doc:
|
|||
remove_label_if_necessary(attributes[i])
|
||||
retokenizer.merge(span, attributes[i])
|
||||
|
||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||
"""Retokenize the document, such that the span at
|
||||
`doc.text[start_idx : end_idx]` is merged into a single token. If
|
||||
`start_idx` and `end_idx `do not mark start and end token boundaries,
|
||||
the document remains unchanged.
|
||||
|
||||
start_idx (int): Character index of the start of the slice to merge.
|
||||
end_idx (int): Character index after the end of the slice to merge.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root of the span.
|
||||
RETURNS (Token): The newly merged token, or `None` if the start and end
|
||||
indices did not fall at token boundaries.
|
||||
"""
|
||||
cdef unicode tag, lemma, ent_type
|
||||
warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
|
||||
# TODO: ENT_KB_ID ?
|
||||
if len(args) == 3:
|
||||
warnings.warn(Warnings.W003, DeprecationWarning)
|
||||
tag, lemma, ent_type = args
|
||||
attributes[TAG] = tag
|
||||
attributes[LEMMA] = lemma
|
||||
attributes[ENT_TYPE] = ent_type
|
||||
elif not args:
|
||||
fix_attributes(self, attributes)
|
||||
elif args:
|
||||
raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
|
||||
kwargs=repr(attributes)))
|
||||
remove_label_if_necessary(attributes)
|
||||
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
|
||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||
if start == -1:
|
||||
return None
|
||||
cdef int end = token_by_end(self.c, self.length, end_idx)
|
||||
if end == -1:
|
||||
return None
|
||||
# Currently we have the token index, we want the range-end index
|
||||
end += 1
|
||||
with self.retokenize() as retokenizer:
|
||||
retokenizer.merge(self[start:end], attrs=attributes)
|
||||
return self[start]
|
||||
|
||||
def print_tree(self, light=False, flat=False):
|
||||
raise ValueError(Errors.E105)
|
||||
|
||||
def to_json(self, underscore=None):
|
||||
"""Convert a Doc to JSON. The format it produces will be the new format
|
||||
for the `spacy train` command (not implemented yet).
|
||||
|
|
|
@ -280,18 +280,6 @@ cdef class Span:
|
|||
|
||||
return array
|
||||
|
||||
def merge(self, *args, **attributes):
|
||||
"""Retokenize the document, such that the span is merged into a single
|
||||
token.
|
||||
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token.
|
||||
"""
|
||||
warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
|
||||
return self.doc.merge(self.start_char, self.end_char, *args,
|
||||
**attributes)
|
||||
|
||||
def get_lca_matrix(self):
|
||||
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given
|
||||
`Span`, where LCA[i, j] is the index of the lowest common ancestor among
|
||||
|
|
|
@ -349,6 +349,33 @@ array of attributes.
|
|||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | Itself. |
|
||||
|
||||
|
||||
## Doc.from_docs {#from_docs tag="staticmethod"}
|
||||
|
||||
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.tokens import Doc
|
||||
> texts = ["London is the capital of the United Kingdom.",
|
||||
> "The River Thames flows through London.",
|
||||
> "The famous Tower Bridge crosses the River Thames."]
|
||||
> docs = list(nlp.pipe(texts))
|
||||
> c_doc = Doc.from_docs(docs)
|
||||
> assert str(c_doc) == " ".join(texts)
|
||||
> assert len(list(c_doc.sents)) == len(docs)
|
||||
> assert [str(ent) for ent in c_doc.ents] == \
|
||||
> [str(ent) for doc in docs for ent in doc.ents]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
|
||||
| `docs` | list | A list of `Doc` objects. |
|
||||
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
|
||||
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
|
||||
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
|
||||
|
||||
## Doc.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
Save the current state to a directory.
|
||||
|
|
|
@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
|
|||
the msgpack object has the following structure:
|
||||
|
||||
```python
|
||||
### msgpack object strcutrue
|
||||
### msgpack object structure
|
||||
{
|
||||
"version": str, # DocBin version number
|
||||
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
|
||||
"tokens": bytes, # Serialized numpy uint64 array with the token data
|
||||
"spaces": bytes, # Serialized numpy boolean array with spaces data
|
||||
|
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
|
|||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
||||
| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
|
||||
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
||||
| **RETURNS** | `DocBin` | The newly constructed object. |
|
||||
|
||||
|
|
|
@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
|
|||
|
||||
## Matcher.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Find all token sequences matching the supplied patterns on the `Doc`. As of
|
||||
spaCy v2.3, the `Matcher` can also be called on `Span` objects.
|
||||
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
|
|||
>
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||
> matcher.add("HelloWorld", None, pattern)
|
||||
> matcher.add("HelloWorld", [pattern])
|
||||
> doc = nlp("hello world!")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
|
||||
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
By default, the matcher **does not perform any action** on matches, like tagging
|
||||
matched phrases with entity types. Instead, actions need to be specified when
|
||||
**adding patterns or entities**, by passing in a callback function as the
|
||||
`on_match` argument on [`add`](/api/matcher#add). This allows you to define
|
||||
custom actions per pattern within the same matcher. For example, you might only
|
||||
want to merge some entity types, and set custom flags for other matched
|
||||
patterns. For more details and examples, see the usage guide on
|
||||
[rule-based matching](/usage/rule-based-matching).
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Matcher.pipe {#pipe tag="method"}
|
||||
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
@ -92,7 +78,7 @@ patterns.
|
|||
> ```python
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> assert len(matcher) == 0
|
||||
> matcher.add("Rule", None, [{"ORTH": "test"}])
|
||||
> matcher.add("Rule", [[{"ORTH": "test"}]])
|
||||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
|
@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
|
|||
>
|
||||
> ```python
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> assert 'Rule' not in matcher
|
||||
> matcher.add('Rule', None, [{'ORTH': 'test'}])
|
||||
> assert 'Rule' in matcher
|
||||
> assert "Rule" not in matcher
|
||||
> matcher.add("Rule", [[{'ORTH': 'test'}]])
|
||||
> assert "Rule" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
|
@ -133,35 +119,35 @@ overwritten.
|
|||
> print('Matched!', matches)
|
||||
>
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
|
||||
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
|
||||
> patterns = [
|
||||
> [{"LOWER": "hello"}, {"LOWER": "world"}],
|
||||
> [{"ORTH": "Google"}, {"ORTH": "Maps"}]
|
||||
> ]
|
||||
> matcher.add("TEST_PATTERNS", patterns)
|
||||
> doc = nlp("HELLO WORLD on Google Maps.")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
<Infobox title="Changed in v2.2.2" variant="warning">
|
||||
|
||||
As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
|
||||
the default in the future. The patterns are now the second argument and a list
|
||||
As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
|
||||
(instead of a variable number of arguments). The `on_match` callback becomes an
|
||||
optional keyword argument.
|
||||
|
||||
```diff
|
||||
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
||||
- matcher.add("GoogleNow", None, *patterns)
|
||||
+ matcher.add("GoogleNow", patterns)
|
||||
- matcher.add("GoogleNow", on_match, *patterns)
|
||||
+ matcher.add("GoogleNow", patterns, on_match=on_match)
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
|
||||
## Matcher.remove {#remove tag="method" new="2"}
|
||||
|
||||
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
|
||||
|
@ -170,7 +156,7 @@ exist.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", None, [{"ORTH": "test"}])
|
||||
> matcher.add("Rule", [[{"ORTH": "test"}]])
|
||||
> assert "Rule" in matcher
|
||||
> matcher.remove("Rule")
|
||||
> assert "Rule" not in matcher
|
||||
|
@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", None, [{"ORTH": "test"}])
|
||||
> matcher.add("Rule", [[{"ORTH": "test"}]])
|
||||
> on_match, patterns = matcher.get("Rule")
|
||||
> ```
|
||||
|
||||
|
|
|
@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
|
|||
> from spacy.matcher import PhraseMatcher
|
||||
>
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> matcher.add("OBAMA", None, nlp("Barack Obama"))
|
||||
> matcher.add("OBAMA", [nlp("Barack Obama")])
|
||||
> doc = nlp("Barack Obama lifts America one last time in emotional farewell")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
@ -104,7 +104,7 @@ patterns.
|
|||
> ```python
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> assert len(matcher) == 0
|
||||
> matcher.add("OBAMA", None, nlp("Barack Obama"))
|
||||
> matcher.add("OBAMA", [nlp("Barack Obama")])
|
||||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
|
@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
|
|||
> ```python
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> assert "OBAMA" not in matcher
|
||||
> matcher.add("OBAMA", None, nlp("Barack Obama"))
|
||||
> matcher.add("OBAMA", [nlp("Barack Obama")])
|
||||
> assert "OBAMA" in matcher
|
||||
> ```
|
||||
|
||||
|
@ -145,36 +145,32 @@ overwritten.
|
|||
> print('Matched!', matches)
|
||||
>
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
|
||||
> matcher.add("HEALTH", on_match, nlp("health care reform"),
|
||||
> nlp("healthcare reform"))
|
||||
> matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
|
||||
> matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
|
||||
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
<Infobox title="Changed in v2.2.2" variant="warning">
|
||||
|
||||
As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
|
||||
become the default in the future. The `Doc` patterns are now the second argument
|
||||
and a list (instead of a variable number of arguments). The `on_match` callback
|
||||
As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
|
||||
argument (instead of a variable number of arguments). The `on_match` callback
|
||||
becomes an optional keyword argument.
|
||||
|
||||
```diff
|
||||
patterns = [nlp("health care reform"), nlp("healthcare reform")]
|
||||
- matcher.add("HEALTH", None, *patterns)
|
||||
+ matcher.add("HEALTH", patterns)
|
||||
- matcher.add("HEALTH", on_match, *patterns)
|
||||
+ matcher.add("HEALTH", patterns, on_match=on_match)
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `docs` | list | `Doc` objects of the phrases to match. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
|
||||
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
||||
|
||||
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
|
||||
|
@ -184,7 +180,7 @@ does not exist.
|
|||
>
|
||||
> ```python
|
||||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> matcher.add("OBAMA", None, nlp("Barack Obama"))
|
||||
> matcher.add("OBAMA", [nlp("Barack Obama")])
|
||||
> assert "OBAMA" in matcher
|
||||
> matcher.remove("OBAMA")
|
||||
> assert "OBAMA" not in matcher
|
||||
|
|
|
@ -407,7 +407,7 @@ class EntityMatcher(object):
|
|||
def __init__(self, nlp, terms, label):
|
||||
patterns = [nlp.make_doc(text) for text in terms]
|
||||
self.matcher = PhraseMatcher(nlp.vocab)
|
||||
self.matcher.add(label, None, *patterns)
|
||||
self.matcher.add(label, patterns)
|
||||
|
||||
def __call__(self, doc):
|
||||
matches = self.matcher(doc)
|
||||
|
|
|
@ -98,9 +98,7 @@ print([token.text for token in doc])
|
|||
|
||||
First, we initialize the `Matcher` with a vocab. The matcher must always share
|
||||
the same vocab with the documents it will operate on. We can now call
|
||||
[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The
|
||||
second argument lets you pass in an optional callback function to invoke on a
|
||||
successful match. For now, we set it to `None`.
|
||||
[`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
|
@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
|
|||
matcher = Matcher(nlp.vocab)
|
||||
# Add match ID "HelloWorld" with no callback and one pattern
|
||||
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
|
||||
matcher.add("HelloWorld", None, pattern)
|
||||
matcher.add("HelloWorld", [pattern])
|
||||
|
||||
doc = nlp("Hello, world! Hello world!")
|
||||
matches = matcher(doc)
|
||||
|
@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
|
|||
also match sequences without punctuation between "hello" and "world":
|
||||
|
||||
```python
|
||||
matcher.add("HelloWorld", None,
|
||||
patterns = [
|
||||
[{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
|
||||
[{"LOWER": "hello"}, {"LOWER": "world"}])
|
||||
[{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||
]
|
||||
matcher.add("HelloWorld", patterns)
|
||||
```
|
||||
|
||||
By default, the matcher will only return the matches and **not do anything
|
||||
|
@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
|
|||
matcher = Matcher(nlp.vocab, validate=True)
|
||||
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
|
||||
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
|
||||
matcher.add("HelloWorld", None, pattern)
|
||||
matcher.add("HelloWorld", [pattern])
|
||||
# 🚨 Raises an error:
|
||||
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
|
||||
# Pattern 0:
|
||||
|
@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
|
|||
print(entity.text)
|
||||
|
||||
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
|
||||
matcher.add("GoogleIO", add_event_ent, pattern)
|
||||
matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
|
||||
doc = nlp("This is a text about Google I/O")
|
||||
matches = matcher(doc)
|
||||
```
|
||||
|
@ -509,19 +509,18 @@ import spacy
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Token
|
||||
|
||||
# We're using a class because the component needs to be initialised with
|
||||
# We're using a class because the component needs to be initialized with
|
||||
# the shared vocab via the nlp object
|
||||
class BadHTMLMerger(object):
|
||||
def __init__(self, nlp):
|
||||
patterns = [
|
||||
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
|
||||
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
|
||||
]
|
||||
# Register a new token extension to flag bad HTML
|
||||
Token.set_extension("bad_html", default=False)
|
||||
self.matcher = Matcher(nlp.vocab)
|
||||
self.matcher.add(
|
||||
"BAD_HTML",
|
||||
None,
|
||||
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
|
||||
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
|
||||
)
|
||||
self.matcher.add("BAD_HTML", patterns)
|
||||
|
||||
def __call__(self, doc):
|
||||
# This method is invoked when the component is called on a Doc
|
||||
|
@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
|
|||
|
||||
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
|
||||
{"POS": "ADJ"}]
|
||||
matcher.add("FacebookIs", collect_sents, pattern) # add pattern
|
||||
matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern
|
||||
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
|
||||
matches = matcher(doc)
|
||||
|
||||
|
@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
|
|||
matcher = Matcher(nlp.vocab)
|
||||
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
|
||||
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
|
||||
matcher.add("PHONE_NUMBER", None, pattern)
|
||||
matcher.add("PHONE_NUMBER", [pattern])
|
||||
|
||||
doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
|
||||
print([t.text for t in doc])
|
||||
|
@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
|
|||
elif doc.vocab.strings[match_id] == "SAD":
|
||||
doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment
|
||||
|
||||
matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern
|
||||
matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
|
||||
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern
|
||||
matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern
|
||||
|
||||
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
|
||||
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
|
||||
matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
|
||||
|
||||
doc = nlp("Hello world 😀 #MondayMotivation")
|
||||
matches = matcher(doc)
|
||||
|
@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
|
|||
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
|
||||
# Only run nlp.make_doc to speed things up
|
||||
patterns = [nlp.make_doc(text) for text in terms]
|
||||
matcher.add("TerminologyList", None, *patterns)
|
||||
matcher.add("TerminologyList", patterns)
|
||||
|
||||
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
|
||||
"converse in the Oval Office inside the White House in Washington, D.C.")
|
||||
|
@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
|
|||
nlp = English()
|
||||
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
|
||||
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
|
||||
matcher.add("Names", None, *patterns)
|
||||
matcher.add("Names", patterns)
|
||||
|
||||
doc = nlp("angela merkel and us president barack Obama")
|
||||
for match_id, start, end in matcher(doc):
|
||||
|
@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
|
|||
|
||||
nlp = English()
|
||||
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
|
||||
matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
|
||||
matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
|
||||
|
||||
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
|
||||
for match_id, start, end in matcher(doc):
|
||||
|
|
|
@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
|
|||
def set_sentiment(matcher, doc, i, matches):
|
||||
doc.sentiment += 0.1
|
||||
|
||||
pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
|
||||
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
|
||||
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
|
||||
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
|
||||
patterns1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
|
||||
patterns2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
|
||||
matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o"
|
||||
matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji
|
||||
|
||||
doc = nlp("A text about Google I/O 😀😀")
|
||||
matches = matcher(doc)
|
||||
|
|
Loading…
Reference in New Issue
Block a user