mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
a66e8e0c8a
|
@ -112,10 +112,10 @@ def write_conllu(docs, file_):
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
with doc.retokenize() as retokenizer:
|
||||||
for start_char, end_char in offsets:
|
for span in spans:
|
||||||
doc.merge(start_char, end_char)
|
retokenizer.merge(span)
|
||||||
# TODO: This shuldn't be necessary? Should be handled in merge
|
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i == word.head.i:
|
if word.i == word.head.i:
|
||||||
word.dep_ = "ROOT"
|
word.dep_ = "ROOT"
|
||||||
|
|
|
@ -217,9 +217,9 @@ def write_conllu(docs, file_):
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
with doc.retokenize() as retokenizer:
|
||||||
for start_char, end_char in offsets:
|
for span in spans:
|
||||||
doc.merge(start_char, end_char)
|
retokenizer.merge(span)
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
|
|
|
@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}):
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
user_warning(Warnings.W005)
|
user_warning(Warnings.W005)
|
||||||
if options.get("collapse_phrases", False):
|
if options.get("collapse_phrases", False):
|
||||||
for np in list(doc.noun_chunks):
|
with doc.retokenize() as retokenizer:
|
||||||
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
|
for np in list(doc.noun_chunks):
|
||||||
|
attrs = {
|
||||||
|
"tag": np.root.tag_,
|
||||||
|
"lemma": np.root.lemma_,
|
||||||
|
"ent_type": np.root.ent_type_,
|
||||||
|
}
|
||||||
|
retokenizer.merge(np, attrs=attrs)
|
||||||
if options.get("collapse_punct", True):
|
if options.get("collapse_punct", True):
|
||||||
spans = []
|
spans = []
|
||||||
for word in doc[:-1]:
|
for word in doc[:-1]:
|
||||||
|
@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}):
|
||||||
while end < len(doc) and doc[end].is_punct:
|
while end < len(doc) and doc[end].is_punct:
|
||||||
end += 1
|
end += 1
|
||||||
span = doc[start:end]
|
span = doc[start:end]
|
||||||
spans.append(
|
spans.append((span, word.tag_, word.lemma_, word.ent_type_))
|
||||||
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
|
with doc.retokenize() as retokenizer:
|
||||||
)
|
for span, tag, lemma, ent_type in spans:
|
||||||
for start, end, tag, lemma, ent_type in spans:
|
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
|
||||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
if options.get("fine_grained"):
|
if options.get("fine_grained"):
|
||||||
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
words = [{"text": w.text, "tag": w.tag_} for w in doc]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -67,6 +67,9 @@ class Warnings(object):
|
||||||
"components are applied. To only create tokenized Doc objects, "
|
"components are applied. To only create tokenized Doc objects, "
|
||||||
"try using `nlp.make_doc(text)` or process all texts as a stream "
|
"try using `nlp.make_doc(text)` or process all texts as a stream "
|
||||||
"using `list(nlp.tokenizer.pipe(all_texts))`.")
|
"using `list(nlp.tokenizer.pipe(all_texts))`.")
|
||||||
|
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
||||||
|
"efficient and less error-prone Doc.retokenize context manager "
|
||||||
|
"instead.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -1,14 +1,13 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
|
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||||
from .char_classes import HYPHENS
|
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||||
from .char_classes import CURRENCY, UNITS
|
|
||||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||||
|
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
["§", "%", "=", r"\+(?![0-9])"]
|
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
|
||||||
+ LIST_PUNCT
|
+ LIST_PUNCT
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
|
@ -22,13 +21,15 @@ _suffixes = (
|
||||||
+ LIST_ELLIPSES
|
+ LIST_ELLIPSES
|
||||||
+ LIST_QUOTES
|
+ LIST_QUOTES
|
||||||
+ LIST_ICONS
|
+ LIST_ICONS
|
||||||
+ ["'s", "'S", "’s", "’S"]
|
+ ["'s", "'S", "’s", "’S", "—", "–"]
|
||||||
+ [
|
+ [
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
|
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
||||||
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
||||||
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
@ -40,8 +41,8 @@ _infixes = (
|
||||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
||||||
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
|
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -4,9 +4,6 @@ from __future__ import unicode_literals
|
||||||
from ..matcher import Matcher
|
from ..matcher import Matcher
|
||||||
|
|
||||||
|
|
||||||
# TODO: replace doc.merge with doc.retokenize
|
|
||||||
|
|
||||||
|
|
||||||
def merge_noun_chunks(doc):
|
def merge_noun_chunks(doc):
|
||||||
"""Merge noun chunks into a single token.
|
"""Merge noun chunks into a single token.
|
||||||
|
|
||||||
|
@ -15,11 +12,10 @@ def merge_noun_chunks(doc):
|
||||||
"""
|
"""
|
||||||
if not doc.is_parsed:
|
if not doc.is_parsed:
|
||||||
return doc
|
return doc
|
||||||
spans = [
|
with doc.retokenize() as retokenizer:
|
||||||
(np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
|
for np in doc.noun_chunks:
|
||||||
]
|
attrs = {"tag": np.root.tag, "dep": np.root.dep}
|
||||||
for start, end, tag, dep in spans:
|
retokenizer.merge(np, attrs=attrs)
|
||||||
doc.merge(start, end, tag=tag, dep=dep)
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,11 +25,10 @@ def merge_entities(doc):
|
||||||
doc (Doc): The Doc object.
|
doc (Doc): The Doc object.
|
||||||
RETURNS (Doc): The Doc object with merged noun entities.
|
RETURNS (Doc): The Doc object with merged noun entities.
|
||||||
"""
|
"""
|
||||||
spans = [
|
with doc.retokenize() as retokenizer:
|
||||||
(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
|
for ent in doc.ents:
|
||||||
]
|
attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
|
||||||
for start, end, tag, dep, ent_type in spans:
|
retokenizer.merge(ent, attrs=attrs)
|
||||||
doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"):
|
||||||
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
|
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
with doc.retokenize() as retokenizer:
|
||||||
for start_char, end_char in offsets:
|
for span in spans:
|
||||||
doc.merge(start_char, end_char)
|
retokenizer.merge(span)
|
||||||
return doc
|
return doc
|
||||||
|
|
|
@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer):
|
||||||
|
|
||||||
def test_doc_api_merge(en_tokenizer):
|
def test_doc_api_merge(en_tokenizer):
|
||||||
text = "WKRO played songs by the beach boys all night"
|
text = "WKRO played songs by the beach boys all night"
|
||||||
|
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||||
# merge 'The Beach Boys'
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
assert len(doc) == 9
|
|
||||||
doc.merge(
|
|
||||||
doc[4].idx,
|
|
||||||
doc[6].idx + len(doc[6]),
|
|
||||||
tag="NAMED",
|
|
||||||
lemma="LEMMA",
|
|
||||||
ent_type="TYPE",
|
|
||||||
)
|
|
||||||
assert len(doc) == 7
|
|
||||||
assert doc[4].text == "the beach boys"
|
|
||||||
assert doc[4].text_with_ws == "the beach boys "
|
|
||||||
assert doc[4].tag_ == "NAMED"
|
|
||||||
|
|
||||||
# merge 'all night'
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
assert len(doc) == 9
|
|
||||||
doc.merge(
|
|
||||||
doc[7].idx,
|
|
||||||
doc[8].idx + len(doc[8]),
|
|
||||||
tag="NAMED",
|
|
||||||
lemma="LEMMA",
|
|
||||||
ent_type="TYPE",
|
|
||||||
)
|
|
||||||
assert len(doc) == 8
|
|
||||||
assert doc[7].text == "all night"
|
|
||||||
assert doc[7].text_with_ws == "all night"
|
|
||||||
|
|
||||||
# merge both with bulk merge
|
# merge both with bulk merge
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert len(doc) == 9
|
assert len(doc) == 9
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
retokenizer.merge(
|
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||||
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
retokenizer.merge(doc[7:9], attrs=attrs)
|
||||||
)
|
|
||||||
retokenizer.merge(
|
|
||||||
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
|
||||||
)
|
|
||||||
|
|
||||||
assert len(doc) == 6
|
|
||||||
assert doc[4].text == "the beach boys"
|
|
||||||
assert doc[4].text_with_ws == "the beach boys "
|
|
||||||
assert doc[4].tag_ == "NAMED"
|
|
||||||
assert doc[5].text == "all night"
|
|
||||||
assert doc[5].text_with_ws == "all night"
|
|
||||||
assert doc[5].tag_ == "NAMED"
|
|
||||||
|
|
||||||
# merge both with bulk merge
|
|
||||||
doc = en_tokenizer(text)
|
|
||||||
assert len(doc) == 9
|
|
||||||
with doc.retokenize() as retokenizer:
|
|
||||||
retokenizer.merge(
|
|
||||||
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
|
||||||
)
|
|
||||||
retokenizer.merge(
|
|
||||||
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
|
||||||
)
|
|
||||||
|
|
||||||
assert len(doc) == 6
|
assert len(doc) == 6
|
||||||
assert doc[4].text == "the beach boys"
|
assert doc[4].text == "the beach boys"
|
||||||
assert doc[4].text_with_ws == "the beach boys "
|
assert doc[4].text_with_ws == "the beach boys "
|
||||||
|
@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer):
|
||||||
def test_doc_api_merge_children(en_tokenizer):
|
def test_doc_api_merge_children(en_tokenizer):
|
||||||
"""Test that attachments work correctly after merging."""
|
"""Test that attachments work correctly after merging."""
|
||||||
text = "WKRO played songs by the beach boys all night"
|
text = "WKRO played songs by the beach boys all night"
|
||||||
|
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert len(doc) == 9
|
assert len(doc) == 9
|
||||||
doc.merge(
|
with doc.retokenize() as retokenizer:
|
||||||
doc[4].idx,
|
retokenizer.merge(doc[4:7], attrs=attrs)
|
||||||
doc[6].idx + len(doc[6]),
|
|
||||||
tag="NAMED",
|
|
||||||
lemma="LEMMA",
|
|
||||||
ent_type="TYPE",
|
|
||||||
)
|
|
||||||
|
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i < word.head.i:
|
if word.i < word.head.i:
|
||||||
assert word in list(word.head.lefts)
|
assert word in list(word.head.lefts)
|
||||||
|
@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer):
|
||||||
def test_doc_api_merge_hang(en_tokenizer):
|
def test_doc_api_merge_hang(en_tokenizer):
|
||||||
text = "through North and South Carolina"
|
text = "through North and South Carolina"
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
|
with doc.retokenize() as retokenizer:
|
||||||
doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
|
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
|
||||||
|
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_retokenizer(en_tokenizer):
|
def test_doc_api_retokenizer(en_tokenizer):
|
||||||
|
@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer):
|
||||||
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
|
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
|
||||||
"ROOT", "amod", "dobj"]
|
"ROOT", "amod", "dobj"]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
|
||||||
|
|
||||||
nps = []
|
nps = []
|
||||||
for np in doc.noun_chunks:
|
for np in doc.noun_chunks:
|
||||||
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
|
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
|
||||||
np = np[1:]
|
np = np[1:]
|
||||||
if len(np) > 1:
|
if len(np) > 1:
|
||||||
nps.append(
|
nps.append(np)
|
||||||
(np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
|
with doc.retokenize() as retokenizer:
|
||||||
)
|
for np in nps:
|
||||||
for np in nps:
|
attrs = {
|
||||||
start, end, tag, lemma, ent_type = np
|
"tag": np.root.tag_,
|
||||||
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
|
"lemma": np.text,
|
||||||
|
"ent_type": np.root.ent_type_,
|
||||||
|
}
|
||||||
|
retokenizer.merge(np, attrs=attrs)
|
||||||
|
|
||||||
|
|
||||||
def test_doc_api_right_edge(en_tokenizer):
|
def test_doc_api_right_edge(en_tokenizer):
|
||||||
|
|
|
@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer):
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].head.text == "Angeles"
|
assert doc[0].head.text == "Angeles"
|
||||||
assert doc[1].head.text == "start"
|
assert doc[1].head.text == "start"
|
||||||
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
|
with doc.retokenize() as retokenizer:
|
||||||
assert len(doc) == 3
|
attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
|
||||||
assert doc[0].text == "Los Angeles"
|
retokenizer.merge(doc[0:2], attrs=attrs)
|
||||||
assert doc[0].head.text == "start"
|
|
||||||
|
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
|
||||||
assert len(doc) == 4
|
|
||||||
assert doc[0].head.text == "Angeles"
|
|
||||||
assert doc[1].head.text == "start"
|
|
||||||
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
|
|
||||||
|
|
||||||
assert len(doc) == 3
|
assert len(doc) == 3
|
||||||
assert doc[0].text == "Los Angeles"
|
assert doc[0].text == "Los Angeles"
|
||||||
assert doc[0].head.text == "start"
|
assert doc[0].head.text == "start"
|
||||||
|
@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer):
|
||||||
heads = [1, 0, 2, 1, -3, -1, -1, -1]
|
heads = [1, 0, 2, 1, -3, -1, -1, -1]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
|
|
||||||
assert doc[4].head.i == 1
|
assert doc[4].head.i == 1
|
||||||
doc.merge(
|
with doc.retokenize() as retokenizer:
|
||||||
doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
|
attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
|
||||||
)
|
retokenizer.merge(doc[2:5], attrs=attrs)
|
||||||
assert doc[2].head.i == 1
|
assert doc[2].head.i == 1
|
||||||
|
|
||||||
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
|
||||||
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
|
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
|
with doc.retokenize() as retokenizer:
|
||||||
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
|
for ent in doc.ents:
|
||||||
for start, end, label, lemma in ents:
|
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
|
||||||
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
|
retokenizer.merge(ent, attrs=attrs)
|
||||||
assert merged is not None, (start, end, label, lemma)
|
|
||||||
|
|
||||||
text = "One test with entities like New York City so the ents list is not void"
|
text = "One test with entities like New York City so the ents list is not void"
|
||||||
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
|
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
for span in doc.ents:
|
with doc.retokenize() as retokenizer:
|
||||||
merged = doc.merge()
|
for ent in doc.ents:
|
||||||
assert merged is not None, (span.start, span.end, span.label_, span.lemma_)
|
retokenizer.merge(ent)
|
||||||
|
|
||||||
|
|
||||||
def test_spans_entity_merge(en_tokenizer):
|
def test_spans_entity_merge(en_tokenizer):
|
||||||
|
@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer):
|
||||||
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
|
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
|
||||||
)
|
)
|
||||||
assert len(doc) == 17
|
assert len(doc) == 17
|
||||||
for ent in doc.ents:
|
with doc.retokenize() as retokenizer:
|
||||||
label, lemma, type_ = (
|
for ent in doc.ents:
|
||||||
ent.root.tag_,
|
ent_type = max(w.ent_type_ for w in ent)
|
||||||
ent.root.lemma_,
|
attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type}
|
||||||
max(w.ent_type_ for w in ent),
|
retokenizer.merge(ent, attrs=attrs)
|
||||||
)
|
|
||||||
ent.merge(label=label, lemma=lemma, ent_type=type_)
|
|
||||||
# check looping is ok
|
# check looping is ok
|
||||||
assert len(doc) == 15
|
assert len(doc) == 15
|
||||||
|
|
||||||
|
@ -132,7 +120,8 @@ def test_spans_entity_merge_iob():
|
||||||
assert doc[1].ent_iob_ == "I"
|
assert doc[1].ent_iob_ == "I"
|
||||||
assert doc[2].ent_iob_ == "I"
|
assert doc[2].ent_iob_ == "I"
|
||||||
assert doc[3].ent_iob_ == "B"
|
assert doc[3].ent_iob_ == "B"
|
||||||
doc[0:1].merge()
|
with doc.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(doc[0:1])
|
||||||
assert doc[0].ent_iob_ == "B"
|
assert doc[0].ent_iob_ == "B"
|
||||||
assert doc[1].ent_iob_ == "I"
|
assert doc[1].ent_iob_ == "I"
|
||||||
|
|
||||||
|
@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
|
||||||
sent1, sent2 = list(doc.sents)
|
sent1, sent2 = list(doc.sents)
|
||||||
init_len = len(sent1)
|
init_len = len(sent1)
|
||||||
init_len2 = len(sent2)
|
init_len2 = len(sent2)
|
||||||
doc[0:2].merge(label="none", lemma="none", ent_type="none")
|
with doc.retokenize() as retokenizer:
|
||||||
doc[-2:].merge(label="none", lemma="none", ent_type="none")
|
attrs = {"lemma": "none", "ent_type": "none"}
|
||||||
|
retokenizer.merge(doc[0:2], attrs=attrs)
|
||||||
|
retokenizer.merge(doc[-2:], attrs=attrs)
|
||||||
assert len(sent1) == init_len - 1
|
assert len(sent1) == init_len - 1
|
||||||
assert len(sent2) == init_len2 - 1
|
assert len(sent2) == init_len2 - 1
|
||||||
|
|
||||||
|
@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer):
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||||
sent1 = list(doc.sents)[0]
|
sent1 = list(doc.sents)[0]
|
||||||
init_len = len(list(sent1.root.subtree))
|
init_len = len(list(sent1.root.subtree))
|
||||||
doc[0:2].merge(label="none", lemma="none", ent_type="none")
|
with doc.retokenize() as retokenizer:
|
||||||
|
attrs = {"lemma": "none", "ent_type": "none"}
|
||||||
|
retokenizer.merge(doc[0:2], attrs=attrs)
|
||||||
assert len(list(sent1.root.subtree)) == init_len - 1
|
assert len(list(sent1.root.subtree)) == init_len - 1
|
||||||
|
|
|
@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
|
||||||
@pytest.mark.parametrize("lang", LANGUAGES)
|
@pytest.mark.parametrize("lang", LANGUAGES)
|
||||||
def test_lang_initialize(lang):
|
def test_lang_initialize(lang):
|
||||||
"""Test that languages can be initialized."""
|
"""Test that languages can be initialized."""
|
||||||
lang_cls = get_lang_class(lang)()
|
lang_cls = get_lang_class(lang)() # noqa: F841
|
||||||
|
|
|
@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab):
|
||||||
if doc.vocab.strings[match_id] == "HAPPY":
|
if doc.vocab.strings[match_id] == "HAPPY":
|
||||||
doc.sentiment += 0.1
|
doc.sentiment += 0.1
|
||||||
span = doc[start:end]
|
span = doc[start:end]
|
||||||
token = span.merge()
|
with doc.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(span)
|
||||||
|
token = doc[start]
|
||||||
token.vocab[token.text].norm_ = "happy emoji"
|
token.vocab[token.text].norm_ = "happy emoji"
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
|
|
@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer):
|
||||||
doc = get_doc(
|
doc = get_doc(
|
||||||
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
|
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
|
||||||
)
|
)
|
||||||
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
with doc.retokenize() as retokenizer:
|
||||||
for start, end, lemma in nps:
|
for np in doc.noun_chunks:
|
||||||
doc.merge(start, end, label="NP", lemma=lemma)
|
retokenizer.merge(np, attrs={"lemma": np.lemma_})
|
||||||
assert doc[0].text == "A phrase"
|
assert doc[0].text == "A phrase"
|
||||||
assert doc[1].text == "with"
|
assert doc[1].text == "with"
|
||||||
assert doc[2].text == "another phrase"
|
assert doc[2].text == "another phrase"
|
||||||
|
|
|
@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.lemmatizer import Lemmatizer
|
from spacy.lemmatizer import Lemmatizer
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc, Span
|
||||||
|
|
||||||
from ..util import get_doc, make_tempdir
|
from ..util import get_doc, make_tempdir
|
||||||
|
|
||||||
|
@ -204,12 +204,13 @@ def test_issue615(en_tokenizer):
|
||||||
on the last match."""
|
on the last match."""
|
||||||
if i != len(matches) - 1:
|
if i != len(matches) - 1:
|
||||||
return None
|
return None
|
||||||
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
|
spans = [Span(doc, start, end, label=label) for label, start, end in matches]
|
||||||
for ent_id, label, span in spans:
|
with doc.retokenize() as retokenizer:
|
||||||
span.merge(
|
for span in spans:
|
||||||
tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
|
tag = "NNP" if span.label_ else span.root.tag_
|
||||||
)
|
attrs = {"tag": tag, "lemma": span.text}
|
||||||
doc.ents = doc.ents + ((label, span.start, span.end),)
|
retokenizer.merge(span, attrs=attrs)
|
||||||
|
doc.ents = doc.ents + (span,)
|
||||||
|
|
||||||
text = "The golf club is broken"
|
text = "The golf club is broken"
|
||||||
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
|
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
|
||||||
|
@ -410,7 +411,7 @@ def test_issue957(en_tokenizer):
|
||||||
"""
|
"""
|
||||||
# Skip test if pytest-timeout is not installed
|
# Skip test if pytest-timeout is not installed
|
||||||
pytest.importorskip("pytest_timeout")
|
pytest.importorskip("pytest_timeout")
|
||||||
for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']:
|
for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
|
||||||
string = "0"
|
string = "0"
|
||||||
for i in range(1, 100):
|
for i in range(1, 100):
|
||||||
string += punct + str(i)
|
string += punct + str(i)
|
||||||
|
|
|
@ -86,7 +86,8 @@ def test_issue1547():
|
||||||
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
|
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
|
||||||
doc = Doc(Vocab(), words=words)
|
doc = Doc(Vocab(), words=words)
|
||||||
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
|
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
|
||||||
doc[5:7].merge()
|
with doc.retokenize() as retokenizer:
|
||||||
|
retokenizer.merge(doc[5:7])
|
||||||
assert [ent.text for ent in doc.ents]
|
assert [ent.text for ent in doc.ents]
|
||||||
|
|
||||||
|
|
||||||
|
|
11
spacy/tests/regression/test_issue3277.py
Normal file
11
spacy/tests/regression/test_issue3277.py
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue3277(es_tokenizer):
|
||||||
|
"""Test that hyphens are split correctly as prefixes."""
|
||||||
|
doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
|
||||||
|
assert len(doc) == 14
|
||||||
|
assert doc[0].text == "\u2014"
|
||||||
|
assert doc[5].text == "\u2013"
|
||||||
|
assert doc[9].text == "\u2013"
|
|
@ -898,6 +898,7 @@ cdef class Doc:
|
||||||
indices did not fall at token boundaries.
|
indices did not fall at token boundaries.
|
||||||
"""
|
"""
|
||||||
cdef unicode tag, lemma, ent_type
|
cdef unicode tag, lemma, ent_type
|
||||||
|
deprecation_warning(Warnings.W013.format(obj="Doc"))
|
||||||
if len(args) == 3:
|
if len(args) == 3:
|
||||||
deprecation_warning(Warnings.W003)
|
deprecation_warning(Warnings.W003)
|
||||||
tag, lemma, ent_type = args
|
tag, lemma, ent_type = args
|
||||||
|
|
|
@ -18,6 +18,7 @@ from ..attrs cimport *
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..compat import is_config, basestring_
|
from ..compat import is_config, basestring_
|
||||||
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
|
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
|
||||||
|
from ..errors import deprecation_warning
|
||||||
from .underscore import Underscore, get_ext_args
|
from .underscore import Underscore, get_ext_args
|
||||||
|
|
||||||
|
|
||||||
|
@ -193,6 +194,7 @@ cdef class Span:
|
||||||
attributes are inherited from the syntactic root token of the span.
|
attributes are inherited from the syntactic root token of the span.
|
||||||
RETURNS (Token): The newly merged token.
|
RETURNS (Token): The newly merged token.
|
||||||
"""
|
"""
|
||||||
|
deprecation_warning(Warnings.W013.format(obj="Span"))
|
||||||
return self.doc.merge(self.start_char, self.end_char, *args,
|
return self.doc.merge(self.start_char, self.end_char, *args,
|
||||||
**attributes)
|
**attributes)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user