Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2019-02-15 10:30:22 +01:00
commit a66e8e0c8a
16 changed files with 120 additions and 160 deletions

View File

@ -112,10 +112,10 @@ def write_conllu(docs, file_):
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans] with doc.retokenize() as retokenizer:
for start_char, end_char in offsets: for span in spans:
doc.merge(start_char, end_char) retokenizer.merge(span)
# TODO: This shuldn't be necessary? Should be handled in merge # TODO: This shouldn't be necessary? Should be handled in merge
for word in doc: for word in doc:
if word.i == word.head.i: if word.i == word.head.i:
word.dep_ = "ROOT" word.dep_ = "ROOT"

View File

@ -217,9 +217,9 @@ def write_conllu(docs, file_):
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans] with doc.retokenize() as retokenizer:
for start_char, end_char in offsets: for span in spans:
doc.merge(start_char, end_char) retokenizer.merge(span)
file_.write("# newdoc id = {i}\n".format(i=i)) file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))

View File

@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}):
if not doc.is_parsed: if not doc.is_parsed:
user_warning(Warnings.W005) user_warning(Warnings.W005)
if options.get("collapse_phrases", False): if options.get("collapse_phrases", False):
for np in list(doc.noun_chunks): with doc.retokenize() as retokenizer:
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_) for np in list(doc.noun_chunks):
attrs = {
"tag": np.root.tag_,
"lemma": np.root.lemma_,
"ent_type": np.root.ent_type_,
}
retokenizer.merge(np, attrs=attrs)
if options.get("collapse_punct", True): if options.get("collapse_punct", True):
spans = [] spans = []
for word in doc[:-1]: for word in doc[:-1]:
@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}):
while end < len(doc) and doc[end].is_punct: while end < len(doc) and doc[end].is_punct:
end += 1 end += 1
span = doc[start:end] span = doc[start:end]
spans.append( spans.append((span, word.tag_, word.lemma_, word.ent_type_))
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_) with doc.retokenize() as retokenizer:
) for span, tag, lemma, ent_type in spans:
for start, end, tag, lemma, ent_type in spans: attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type) retokenizer.merge(span, attrs=attrs)
if options.get("fine_grained"): if options.get("fine_grained"):
words = [{"text": w.text, "tag": w.tag_} for w in doc] words = [{"text": w.text, "tag": w.tag_} for w in doc]
else: else:

View File

@ -67,6 +67,9 @@ class Warnings(object):
"components are applied. To only create tokenized Doc objects, " "components are applied. To only create tokenized Doc objects, "
"try using `nlp.make_doc(text)` or process all texts as a stream " "try using `nlp.make_doc(text)` or process all texts as a stream "
"using `list(nlp.tokenizer.pipe(all_texts))`.") "using `list(nlp.tokenizer.pipe(all_texts))`.")
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
"efficient and less error-prone Doc.retokenize context manager "
"instead.")
@add_codes @add_codes

View File

@ -1,14 +1,13 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import HYPHENS from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_prefixes = ( _prefixes = (
["§", "%", "=", r"\+(?![0-9])"] ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
+ LIST_PUNCT + LIST_PUNCT
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
@ -22,13 +21,15 @@ _suffixes = (
+ LIST_ELLIPSES + LIST_ELLIPSES
+ LIST_QUOTES + LIST_QUOTES
+ LIST_ICONS + LIST_ICONS
+ ["'s", "'S", "’s", "’S"] + ["'s", "'S", "’s", "’S", "—", "–"]
+ [ + [
r"(?<=[0-9])\+", r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.", r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
] ]
) )
@ -40,8 +41,8 @@ _infixes = (
r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA), r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
] ]
) )

View File

@ -4,9 +4,6 @@ from __future__ import unicode_literals
from ..matcher import Matcher from ..matcher import Matcher
# TODO: replace doc.merge with doc.retokenize
def merge_noun_chunks(doc): def merge_noun_chunks(doc):
"""Merge noun chunks into a single token. """Merge noun chunks into a single token.
@ -15,11 +12,10 @@ def merge_noun_chunks(doc):
""" """
if not doc.is_parsed: if not doc.is_parsed:
return doc return doc
spans = [ with doc.retokenize() as retokenizer:
(np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks for np in doc.noun_chunks:
] attrs = {"tag": np.root.tag, "dep": np.root.dep}
for start, end, tag, dep in spans: retokenizer.merge(np, attrs=attrs)
doc.merge(start, end, tag=tag, dep=dep)
return doc return doc
@ -29,11 +25,10 @@ def merge_entities(doc):
doc (Doc): The Doc object. doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged noun entities. RETURNS (Doc): The Doc object with merged noun entities.
""" """
spans = [ with doc.retokenize() as retokenizer:
(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents for ent in doc.ents:
] attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
for start, end, tag, dep, ent_type in spans: retokenizer.merge(ent, attrs=attrs)
doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
return doc return doc
@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"):
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
matches = merger(doc) matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans] with doc.retokenize() as retokenizer:
for start_char, end_char in offsets: for span in spans:
doc.merge(start_char, end_char) retokenizer.merge(span)
return doc return doc

View File

@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer):
def test_doc_api_merge(en_tokenizer): def test_doc_api_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night" text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
# merge 'The Beach Boys'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(
doc[4].idx,
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
# merge 'all night'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(
doc[7].idx,
doc[8].idx + len(doc[8]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 8
assert doc[7].text == "all night"
assert doc[7].text_with_ws == "all night"
# merge both with bulk merge # merge both with bulk merge
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert len(doc) == 9 assert len(doc) == 9
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge( retokenizer.merge(doc[4:7], attrs=attrs)
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} retokenizer.merge(doc[7:9], attrs=attrs)
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
assert len(doc) == 6
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
# merge both with bulk merge
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
assert len(doc) == 6 assert len(doc) == 6
assert doc[4].text == "the beach boys" assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys " assert doc[4].text_with_ws == "the beach boys "
@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer):
def test_doc_api_merge_children(en_tokenizer): def test_doc_api_merge_children(en_tokenizer):
"""Test that attachments work correctly after merging.""" """Test that attachments work correctly after merging."""
text = "WKRO played songs by the beach boys all night" text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
doc = en_tokenizer(text) doc = en_tokenizer(text)
assert len(doc) == 9 assert len(doc) == 9
doc.merge( with doc.retokenize() as retokenizer:
doc[4].idx, retokenizer.merge(doc[4:7], attrs=attrs)
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
for word in doc: for word in doc:
if word.i < word.head.i: if word.i < word.head.i:
assert word in list(word.head.lefts) assert word in list(word.head.lefts)
@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer):
def test_doc_api_merge_hang(en_tokenizer): def test_doc_api_merge_hang(en_tokenizer):
text = "through North and South Carolina" text = "through North and South Carolina"
doc = en_tokenizer(text) doc = en_tokenizer(text)
doc.merge(18, 32, tag="", lemma="", ent_type="ORG") with doc.retokenize() as retokenizer:
doc.merge(8, 32, tag="", lemma="", ent_type="ORG") retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
def test_doc_api_retokenizer(en_tokenizer): def test_doc_api_retokenizer(en_tokenizer):
@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer):
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
"ROOT", "amod", "dobj"] "ROOT", "amod", "dobj"]
# fmt: on # fmt: on
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
nps = [] nps = []
for np in doc.noun_chunks: for np in doc.noun_chunks:
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
np = np[1:] np = np[1:]
if len(np) > 1: if len(np) > 1:
nps.append( nps.append(np)
(np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_) with doc.retokenize() as retokenizer:
) for np in nps:
for np in nps: attrs = {
start, end, tag, lemma, ent_type = np "tag": np.root.tag_,
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type) "lemma": np.text,
"ent_type": np.root.ent_type_,
}
retokenizer.merge(np, attrs=attrs)
def test_doc_api_right_edge(en_tokenizer): def test_doc_api_right_edge(en_tokenizer):

View File

@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer):
assert len(doc) == 4 assert len(doc) == 4
assert doc[0].head.text == "Angeles" assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start" assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE") with doc.retokenize() as retokenizer:
assert len(doc) == 3 attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
assert doc[0].text == "Los Angeles" retokenizer.merge(doc[0:2], attrs=attrs)
assert doc[0].head.text == "start"
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert len(doc) == 4
assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
assert len(doc) == 3 assert len(doc) == 3
assert doc[0].text == "Los Angeles" assert doc[0].text == "Los Angeles"
assert doc[0].head.text == "start" assert doc[0].head.text == "start"
@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer):
heads = [1, 0, 2, 1, -3, -1, -1, -1] heads = [1, 0, 2, 1, -3, -1, -1, -1]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert doc[4].head.i == 1 assert doc[4].head.i == 1
doc.merge( with doc.retokenize() as retokenizer:
doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O" attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
) retokenizer.merge(doc[2:5], attrs=attrs)
assert doc[2].head.i == 1 assert doc[2].head.i == 1
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15] heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
with doc.retokenize() as retokenizer:
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents] for ent in doc.ents:
for start, end, label, lemma in ents: attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label) retokenizer.merge(ent, attrs=attrs)
assert merged is not None, (start, end, label, lemma)
text = "One test with entities like New York City so the ents list is not void" text = "One test with entities like New York City so the ents list is not void"
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
for span in doc.ents: with doc.retokenize() as retokenizer:
merged = doc.merge() for ent in doc.ents:
assert merged is not None, (span.start, span.end, span.label_, span.lemma_) retokenizer.merge(ent)
def test_spans_entity_merge(en_tokenizer): def test_spans_entity_merge(en_tokenizer):
@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer):
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
) )
assert len(doc) == 17 assert len(doc) == 17
for ent in doc.ents: with doc.retokenize() as retokenizer:
label, lemma, type_ = ( for ent in doc.ents:
ent.root.tag_, ent_type = max(w.ent_type_ for w in ent)
ent.root.lemma_, attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type}
max(w.ent_type_ for w in ent), retokenizer.merge(ent, attrs=attrs)
)
ent.merge(label=label, lemma=lemma, ent_type=type_)
# check looping is ok # check looping is ok
assert len(doc) == 15 assert len(doc) == 15
@ -132,7 +120,8 @@ def test_spans_entity_merge_iob():
assert doc[1].ent_iob_ == "I" assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I" assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "B" assert doc[3].ent_iob_ == "B"
doc[0:1].merge() with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1])
assert doc[0].ent_iob_ == "B" assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I" assert doc[1].ent_iob_ == "I"
@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
sent1, sent2 = list(doc.sents) sent1, sent2 = list(doc.sents)
init_len = len(sent1) init_len = len(sent1)
init_len2 = len(sent2) init_len2 = len(sent2)
doc[0:2].merge(label="none", lemma="none", ent_type="none") with doc.retokenize() as retokenizer:
doc[-2:].merge(label="none", lemma="none", ent_type="none") attrs = {"lemma": "none", "ent_type": "none"}
retokenizer.merge(doc[0:2], attrs=attrs)
retokenizer.merge(doc[-2:], attrs=attrs)
assert len(sent1) == init_len - 1 assert len(sent1) == init_len - 1
assert len(sent2) == init_len2 - 1 assert len(sent2) == init_len2 - 1
@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer):
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1 = list(doc.sents)[0] sent1 = list(doc.sents)[0]
init_len = len(list(sent1.root.subtree)) init_len = len(list(sent1.root.subtree))
doc[0:2].merge(label="none", lemma="none", ent_type="none") with doc.retokenize() as retokenizer:
attrs = {"lemma": "none", "ent_type": "none"}
retokenizer.merge(doc[0:2], attrs=attrs)
assert len(list(sent1.root.subtree)) == init_len - 1 assert len(list(sent1.root.subtree)) == init_len - 1

View File

@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
@pytest.mark.parametrize("lang", LANGUAGES) @pytest.mark.parametrize("lang", LANGUAGES)
def test_lang_initialize(lang): def test_lang_initialize(lang):
"""Test that languages can be initialized.""" """Test that languages can be initialized."""
lang_cls = get_lang_class(lang)() lang_cls = get_lang_class(lang)() # noqa: F841

View File

@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab):
if doc.vocab.strings[match_id] == "HAPPY": if doc.vocab.strings[match_id] == "HAPPY":
doc.sentiment += 0.1 doc.sentiment += 0.1
span = doc[start:end] span = doc[start:end]
token = span.merge() with doc.retokenize() as retokenizer:
retokenizer.merge(span)
token = doc[start]
token.vocab[token.text].norm_ = "happy emoji" token.vocab[token.text].norm_ = "happy emoji"
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)

View File

@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer):
doc = get_doc( doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
) )
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks] with doc.retokenize() as retokenizer:
for start, end, lemma in nps: for np in doc.noun_chunks:
doc.merge(start, end, label="NP", lemma=lemma) retokenizer.merge(np, attrs={"lemma": np.lemma_})
assert doc[0].text == "A phrase" assert doc[0].text == "A phrase"
assert doc[1].text == "with" assert doc[1].text == "with"
assert doc[2].text == "another phrase" assert doc[2].text == "another phrase"

View File

@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.language import Language from spacy.language import Language
from spacy.lemmatizer import Lemmatizer from spacy.lemmatizer import Lemmatizer
from spacy.tokens import Doc from spacy.tokens import Doc, Span
from ..util import get_doc, make_tempdir from ..util import get_doc, make_tempdir
@ -204,12 +204,13 @@ def test_issue615(en_tokenizer):
on the last match.""" on the last match."""
if i != len(matches) - 1: if i != len(matches) - 1:
return None return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] spans = [Span(doc, start, end, label=label) for label, start, end in matches]
for ent_id, label, span in spans: with doc.retokenize() as retokenizer:
span.merge( for span in spans:
tag="NNP" if label else span.root.tag_, lemma=span.text, label=label tag = "NNP" if span.label_ else span.root.tag_
) attrs = {"tag": tag, "lemma": span.text}
doc.ents = doc.ents + ((label, span.start, span.end),) retokenizer.merge(span, attrs=attrs)
doc.ents = doc.ents + (span,)
text = "The golf club is broken" text = "The golf club is broken"
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
@ -410,7 +411,7 @@ def test_issue957(en_tokenizer):
""" """
# Skip test if pytest-timeout is not installed # Skip test if pytest-timeout is not installed
pytest.importorskip("pytest_timeout") pytest.importorskip("pytest_timeout")
for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']: for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
string = "0" string = "0"
for i in range(1, 100): for i in range(1, 100):
string += punct + str(i) string += punct + str(i)

View File

@ -86,7 +86,8 @@ def test_issue1547():
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
doc = Doc(Vocab(), words=words) doc = Doc(Vocab(), words=words)
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
doc[5:7].merge() with doc.retokenize() as retokenizer:
retokenizer.merge(doc[5:7])
assert [ent.text for ent in doc.ents] assert [ent.text for ent in doc.ents]

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
def test_issue3277(es_tokenizer):
"""Test that hyphens are split correctly as prefixes."""
doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
assert len(doc) == 14
assert doc[0].text == "\u2014"
assert doc[5].text == "\u2013"
assert doc[9].text == "\u2013"

View File

@ -898,6 +898,7 @@ cdef class Doc:
indices did not fall at token boundaries. indices did not fall at token boundaries.
""" """
cdef unicode tag, lemma, ent_type cdef unicode tag, lemma, ent_type
deprecation_warning(Warnings.W013.format(obj="Doc"))
if len(args) == 3: if len(args) == 3:
deprecation_warning(Warnings.W003) deprecation_warning(Warnings.W003)
tag, lemma, ent_type = args tag, lemma, ent_type = args

View File

@ -18,6 +18,7 @@ from ..attrs cimport *
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..compat import is_config, basestring_ from ..compat import is_config, basestring_
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
from ..errors import deprecation_warning
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
@ -193,6 +194,7 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token. RETURNS (Token): The newly merged token.
""" """
deprecation_warning(Warnings.W013.format(obj="Span"))
return self.doc.merge(self.start_char, self.end_char, *args, return self.doc.merge(self.start_char, self.end_char, *args,
**attributes) **attributes)