Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2019-02-15 10:30:22 +01:00
commit a66e8e0c8a
16 changed files with 120 additions and 160 deletions

View File

@ -112,10 +112,10 @@ def write_conllu(docs, file_):
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
# TODO: This shuldn't be necessary? Should be handled in merge
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
# TODO: This shouldn't be necessary? Should be handled in merge
for word in doc:
if word.i == word.head.i:
word.dep_ = "ROOT"

View File

@ -217,9 +217,9 @@ def write_conllu(docs, file_):
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))

View File

@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}):
if not doc.is_parsed:
user_warning(Warnings.W005)
if options.get("collapse_phrases", False):
with doc.retokenize() as retokenizer:
for np in list(doc.noun_chunks):
np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_)
attrs = {
"tag": np.root.tag_,
"lemma": np.root.lemma_,
"ent_type": np.root.ent_type_,
}
retokenizer.merge(np, attrs=attrs)
if options.get("collapse_punct", True):
spans = []
for word in doc[:-1]:
@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}):
while end < len(doc) and doc[end].is_punct:
end += 1
span = doc[start:end]
spans.append(
(span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_)
)
for start, end, tag, lemma, ent_type in spans:
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
spans.append((span, word.tag_, word.lemma_, word.ent_type_))
with doc.retokenize() as retokenizer:
for span, tag, lemma, ent_type in spans:
attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
retokenizer.merge(span, attrs=attrs)
if options.get("fine_grained"):
words = [{"text": w.text, "tag": w.tag_} for w in doc]
else:

View File

@ -67,6 +67,9 @@ class Warnings(object):
"components are applied. To only create tokenized Doc objects, "
"try using `nlp.make_doc(text)` or process all texts as a stream "
"using `list(nlp.tokenizer.pipe(all_texts))`.")
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
"efficient and less error-prone Doc.retokenize context manager "
"instead.")
@add_codes

View File

@ -1,14 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS
from .char_classes import HYPHENS
from .char_classes import CURRENCY, UNITS
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_prefixes = (
["§", "%", "=", r"\+(?![0-9])"]
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
+ LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
@ -22,13 +21,15 @@ _suffixes = (
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ ["'s", "'S", "’s", "’S"]
+ ["'s", "'S", "’s", "’S", "—", "–"]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
)
@ -40,8 +41,8 @@ _infixes = (
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

View File

@ -4,9 +4,6 @@ from __future__ import unicode_literals
from ..matcher import Matcher
# TODO: replace doc.merge with doc.retokenize
def merge_noun_chunks(doc):
"""Merge noun chunks into a single token.
@ -15,11 +12,10 @@ def merge_noun_chunks(doc):
"""
if not doc.is_parsed:
return doc
spans = [
(np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
]
for start, end, tag, dep in spans:
doc.merge(start, end, tag=tag, dep=dep)
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:
attrs = {"tag": np.root.tag, "dep": np.root.dep}
retokenizer.merge(np, attrs=attrs)
return doc
@ -29,11 +25,10 @@ def merge_entities(doc):
doc (Doc): The Doc object.
RETURNS (Doc): The Doc object with merged noun entities.
"""
spans = [
(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
]
for start, end, tag, dep, ent_type in spans:
doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
retokenizer.merge(ent, attrs=attrs)
return doc
@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"):
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
return doc

View File

@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer):
def test_doc_api_merge(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
# merge 'The Beach Boys'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(
doc[4].idx,
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
# merge 'all night'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(
doc[7].idx,
doc[8].idx + len(doc[8]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 8
assert doc[7].text == "all night"
assert doc[7].text_with_ws == "all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
# merge both with bulk merge
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
assert len(doc) == 6
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"
# merge both with bulk merge
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(doc[4:7], attrs=attrs)
retokenizer.merge(doc[7:9], attrs=attrs)
assert len(doc) == 6
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer):
def test_doc_api_merge_children(en_tokenizer):
"""Test that attachments work correctly after merging."""
text = "WKRO played songs by the beach boys all night"
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(
doc[4].idx,
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
for word in doc:
if word.i < word.head.i:
assert word in list(word.head.lefts)
@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer):
def test_doc_api_merge_hang(en_tokenizer):
text = "through North and South Carolina"
doc = en_tokenizer(text)
doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"})
retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"})
def test_doc_api_retokenizer(en_tokenizer):
@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer):
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
"ROOT", "amod", "dobj"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
nps = []
for np in doc.noun_chunks:
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
np = np[1:]
if len(np) > 1:
nps.append(
(np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
)
nps.append(np)
with doc.retokenize() as retokenizer:
for np in nps:
start, end, tag, lemma, ent_type = np
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
attrs = {
"tag": np.root.tag_,
"lemma": np.text,
"ent_type": np.root.ent_type_,
}
retokenizer.merge(np, attrs=attrs)
def test_doc_api_right_edge(en_tokenizer):

View File

@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer):
assert len(doc) == 4
assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
assert len(doc) == 3
assert doc[0].text == "Los Angeles"
assert doc[0].head.text == "start"
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert len(doc) == 4
assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
with doc.retokenize() as retokenizer:
attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"}
retokenizer.merge(doc[0:2], attrs=attrs)
assert len(doc) == 3
assert doc[0].text == "Los Angeles"
assert doc[0].head.text == "start"
@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer):
heads = [1, 0, 2, 1, -3, -1, -1, -1]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert doc[4].head.i == 1
doc.merge(
doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
)
with doc.retokenize() as retokenizer:
attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"}
retokenizer.merge(doc[2:5], attrs=attrs)
assert doc[2].head.i == 1
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
for start, end, label, lemma in ents:
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
assert merged is not None, (start, end, label, lemma)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_}
retokenizer.merge(ent, attrs=attrs)
text = "One test with entities like New York City so the ents list is not void"
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
for span in doc.ents:
merged = doc.merge()
assert merged is not None, (span.start, span.end, span.label_, span.lemma_)
with doc.retokenize() as retokenizer:
for ent in doc.ents:
retokenizer.merge(ent)
def test_spans_entity_merge(en_tokenizer):
@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer):
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
)
assert len(doc) == 17
with doc.retokenize() as retokenizer:
for ent in doc.ents:
label, lemma, type_ = (
ent.root.tag_,
ent.root.lemma_,
max(w.ent_type_ for w in ent),
)
ent.merge(label=label, lemma=lemma, ent_type=type_)
ent_type = max(w.ent_type_ for w in ent)
attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type}
retokenizer.merge(ent, attrs=attrs)
# check looping is ok
assert len(doc) == 15
@ -132,7 +120,8 @@ def test_spans_entity_merge_iob():
assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "B"
doc[0:1].merge()
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1])
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer):
sent1, sent2 = list(doc.sents)
init_len = len(sent1)
init_len2 = len(sent2)
doc[0:2].merge(label="none", lemma="none", ent_type="none")
doc[-2:].merge(label="none", lemma="none", ent_type="none")
with doc.retokenize() as retokenizer:
attrs = {"lemma": "none", "ent_type": "none"}
retokenizer.merge(doc[0:2], attrs=attrs)
retokenizer.merge(doc[-2:], attrs=attrs)
assert len(sent1) == init_len - 1
assert len(sent2) == init_len2 - 1
@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer):
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1 = list(doc.sents)[0]
init_len = len(list(sent1.root.subtree))
doc[0:2].merge(label="none", lemma="none", ent_type="none")
with doc.retokenize() as retokenizer:
attrs = {"lemma": "none", "ent_type": "none"}
retokenizer.merge(doc[0:2], attrs=attrs)
assert len(list(sent1.root.subtree)) == init_len - 1

View File

@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
@pytest.mark.parametrize("lang", LANGUAGES)
def test_lang_initialize(lang):
"""Test that languages can be initialized."""
lang_cls = get_lang_class(lang)()
lang_cls = get_lang_class(lang)() # noqa: F841

View File

@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab):
if doc.vocab.strings[match_id] == "HAPPY":
doc.sentiment += 0.1
span = doc[start:end]
token = span.merge()
with doc.retokenize() as retokenizer:
retokenizer.merge(span)
token = doc[start]
token.vocab[token.text].norm_ = "happy emoji"
matcher = Matcher(en_vocab)

View File

@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer):
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
)
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
for start, end, lemma in nps:
doc.merge(start, end, label="NP", lemma=lemma)
with doc.retokenize() as retokenizer:
for np in doc.noun_chunks:
retokenizer.merge(np, attrs={"lemma": np.lemma_})
assert doc[0].text == "A phrase"
assert doc[1].text == "with"
assert doc[2].text == "another phrase"

View File

@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.lemmatizer import Lemmatizer
from spacy.tokens import Doc
from spacy.tokens import Doc, Span
from ..util import get_doc, make_tempdir
@ -204,12 +204,13 @@ def test_issue615(en_tokenizer):
on the last match."""
if i != len(matches) - 1:
return None
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge(
tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
)
doc.ents = doc.ents + ((label, span.start, span.end),)
spans = [Span(doc, start, end, label=label) for label, start, end in matches]
with doc.retokenize() as retokenizer:
for span in spans:
tag = "NNP" if span.label_ else span.root.tag_
attrs = {"tag": tag, "lemma": span.text}
retokenizer.merge(span, attrs=attrs)
doc.ents = doc.ents + (span,)
text = "The golf club is broken"
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
@ -410,7 +411,7 @@ def test_issue957(en_tokenizer):
"""
# Skip test if pytest-timeout is not installed
pytest.importorskip("pytest_timeout")
for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']:
for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]:
string = "0"
for i in range(1, 100):
string += punct + str(i)

View File

@ -86,7 +86,8 @@ def test_issue1547():
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
doc = Doc(Vocab(), words=words)
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
doc[5:7].merge()
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[5:7])
assert [ent.text for ent in doc.ents]

View File

@ -0,0 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
def test_issue3277(es_tokenizer):
    """Test that hyphens are split correctly as prefixes.

    The sentence opens with an em dash (U+2014) and wraps the interjection
    "murmuró el niño" in en dashes (U+2013). Each dash must come out as its
    own token, giving 14 tokens in total with the en dashes at indices 5
    and 9.
    """
    # NOTE: the two en dashes around "murmuró el niño" are essential; without
    # them the sentence has only 12 tokens and the index asserts cannot hold.
    doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.")
    assert len(doc) == 14
    assert doc[0].text == "\u2014"
    assert doc[5].text == "\u2013"
    assert doc[9].text == "\u2013"

View File

@ -898,6 +898,7 @@ cdef class Doc:
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
deprecation_warning(Warnings.W013.format(obj="Doc"))
if len(args) == 3:
deprecation_warning(Warnings.W003)
tag, lemma, ent_type = args

View File

@ -18,6 +18,7 @@ from ..attrs cimport *
from ..lexeme cimport Lexeme
from ..compat import is_config, basestring_
from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning
from ..errors import deprecation_warning
from .underscore import Underscore, get_ext_args
@ -193,6 +194,7 @@ cdef class Span:
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
deprecation_warning(Warnings.W013.format(obj="Span"))
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)