From 5651a0d052bcfd160b187828aa3d8d90652929fe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Feb 2019 10:29:44 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=92=AB=20Replace=20{Doc,Span}.merge?= =?UTF-8?q?=20with=20Doc.retokenize=20(#3280)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add deprecation warning to Doc.merge and Span.merge * Replace {Doc,Span}.merge with Doc.retokenize --- spacy/cli/ud/ud_run_test.py | 8 +- spacy/cli/ud/ud_train.py | 6 +- spacy/displacy/__init__.py | 20 ++-- spacy/errors.py | 3 + spacy/pipeline/functions.py | 27 +++--- spacy/tests/doc/test_doc_api.py | 92 ++++--------------- spacy/tests/doc/test_span_merge.py | 61 ++++++------ spacy/tests/lang/test_initialize.py | 2 +- spacy/tests/matcher/test_matcher_api.py | 4 +- spacy/tests/parser/test_parse.py | 6 +- spacy/tests/regression/test_issue1-1000.py | 17 ++-- spacy/tests/regression/test_issue1501-2000.py | 3 +- spacy/tokens/doc.pyx | 1 + spacy/tokens/span.pyx | 2 + 14 files changed, 100 insertions(+), 152 deletions(-) diff --git a/spacy/cli/ud/ud_run_test.py b/spacy/cli/ud/ud_run_test.py index 43140eb03..35c878721 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -112,10 +112,10 @@ def write_conllu(docs, file_): for i, doc in enumerate(docs): matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) - # TODO: This shuldn't be necessary? Should be handled in merge + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) + # TODO: This shouldn't be necessary? Should be handled in merge for word in doc: if word.i == word.head.i: word.dep_ = "ROOT" diff --git a/spacy/cli/ud/ud_train.py b/spacy/cli/ud/ud_train.py index 6c4fbb3eb..563fcfb87 100644 --- a/spacy/cli/ud/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -217,9 +217,9 @@ def write_conllu(docs, file_): for i, doc in enumerate(docs): matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) file_.write("# newdoc id = {i}\n".format(i=i)) for j, sent in enumerate(doc.sents): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3a3cba708..f8886848d 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -107,8 +107,14 @@ def parse_deps(orig_doc, options={}): if not doc.is_parsed: user_warning(Warnings.W005) if options.get("collapse_phrases", False): - for np in list(doc.noun_chunks): - np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_) + with doc.retokenize() as retokenizer: + for np in list(doc.noun_chunks): + attrs = { + "tag": np.root.tag_, + "lemma": np.root.lemma_, + "ent_type": np.root.ent_type_, + } + retokenizer.merge(np, attrs=attrs) if options.get("collapse_punct", True): spans = [] for word in doc[:-1]: @@ -119,11 +125,11 @@ def parse_deps(orig_doc, options={}): while end < len(doc) and doc[end].is_punct: end += 1 span = doc[start:end] - spans.append( - (span.start_char, span.end_char, word.tag_, word.lemma_, word.ent_type_) - ) - for start, end, tag, lemma, ent_type in spans: - doc.merge(start, end, tag=tag, lemma=lemma, 
ent_type=ent_type) + spans.append((span, word.tag_, word.lemma_, word.ent_type_)) + with doc.retokenize() as retokenizer: + for span, tag, lemma, ent_type in spans: + attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type} + retokenizer.merge(span, attrs=attrs) if options.get("fine_grained"): words = [{"text": w.text, "tag": w.tag_} for w in doc] else: diff --git a/spacy/errors.py b/spacy/errors.py index 37deb4560..f73e38c6a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -67,6 +67,9 @@ class Warnings(object): "components are applied. To only create tokenized Doc objects, " "try using `nlp.make_doc(text)` or process all texts as a stream " "using `list(nlp.tokenizer.pipe(all_texts))`.") + W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more " + "efficient and less error-prone Doc.retokenize context manager " + "instead.") @add_codes diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 23000d948..925f0e0fc 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -4,9 +4,6 @@ from __future__ import unicode_literals from ..matcher import Matcher -# TODO: replace doc.merge with doc.retokenize - - def merge_noun_chunks(doc): """Merge noun chunks into a single token. @@ -15,11 +12,10 @@ def merge_noun_chunks(doc): """ if not doc.is_parsed: return doc - spans = [ - (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks - ] - for start, end, tag, dep in spans: - doc.merge(start, end, tag=tag, dep=dep) + with doc.retokenize() as retokenizer: + for np in doc.noun_chunks: + attrs = {"tag": np.root.tag, "dep": np.root.dep} + retokenizer.merge(np, attrs=attrs) return doc @@ -29,11 +25,10 @@ def merge_entities(doc): doc (Doc): The Doc object. RETURNS (Doc): The Doc object with merged noun entities. 
""" - spans = [ - (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents - ] - for start, end, tag, dep, ent_type in spans: - doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label} + retokenizer.merge(ent, attrs=attrs) return doc @@ -42,7 +37,7 @@ def merge_subtokens(doc, label="subtok"): merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) matches = merger(doc) spans = [doc[start : end + 1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) return doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 77632d7c7..878ecd240 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -141,66 +141,13 @@ def test_doc_api_set_ents(en_tokenizer): def test_doc_api_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - - # merge 'The Beach Boys' - doc = en_tokenizer(text) - assert len(doc) == 9 - doc.merge( - doc[4].idx, - doc[6].idx + len(doc[6]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - assert len(doc) == 7 - assert doc[4].text == "the beach boys" - assert doc[4].text_with_ws == "the beach boys " - assert doc[4].tag_ == "NAMED" - - # merge 'all night' - doc = en_tokenizer(text) - assert len(doc) == 9 - doc.merge( - doc[7].idx, - doc[8].idx + len(doc[8]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - assert len(doc) == 8 - assert doc[7].text == "all night" - assert doc[7].text_with_ws == "all night" - + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} # merge both with bulk merge doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: - retokenizer.merge( - doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - retokenizer.merge( - doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - - assert len(doc) == 6 - assert doc[4].text == "the beach boys" - assert doc[4].text_with_ws == "the beach boys " - assert doc[4].tag_ == "NAMED" - assert doc[5].text == "all night" - assert doc[5].text_with_ws == "all night" - assert doc[5].tag_ == "NAMED" - - # merge both with bulk merge - doc = en_tokenizer(text) - assert len(doc) == 9 - with doc.retokenize() as retokenizer: - retokenizer.merge( - doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - retokenizer.merge( - doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} - ) - + retokenizer.merge(doc[4:7], attrs=attrs) + retokenizer.merge(doc[7:9], attrs=attrs) assert len(doc) == 6 assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " @@ -213,16 +160,11 @@ def test_doc_api_merge(en_tokenizer): def test_doc_api_merge_children(en_tokenizer): """Test that attachments work correctly after merging.""" text = "WKRO played songs by the beach boys all night" + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} doc = en_tokenizer(text) assert len(doc) == 9 - doc.merge( - doc[4].idx, - doc[6].idx + len(doc[6]), - tag="NAMED", - lemma="LEMMA", - ent_type="TYPE", - ) - + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[4:7], attrs=attrs) for word in doc: if word.i < word.head.i: assert word in list(word.head.lefts) 
@@ -233,8 +175,9 @@ def test_doc_api_merge_children(en_tokenizer): def test_doc_api_merge_hang(en_tokenizer): text = "through North and South Carolina" doc = en_tokenizer(text) - doc.merge(18, 32, tag="", lemma="", ent_type="ORG") - doc.merge(8, 32, tag="", lemma="", ent_type="ORG") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[3:5], attrs={"lemma": "", "ent_type": "ORG"}) + retokenizer.merge(doc[1:2], attrs={"lemma": "", "ent_type": "ORG"}) def test_doc_api_retokenizer(en_tokenizer): @@ -287,21 +230,22 @@ def test_doc_api_runtime_error(en_tokenizer): "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", "ROOT", "amod", "dobj"] # fmt: on - tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) - nps = [] for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): np = np[1:] if len(np) > 1: - nps.append( - (np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_) - ) - for np in nps: - start, end, tag, lemma, ent_type = np - doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type) + nps.append(np) + with doc.retokenize() as retokenizer: + for np in nps: + attrs = { + "tag": np.root.tag_, + "lemma": np.text, + "ent_type": np.root.ent_type_, + } + retokenizer.merge(np, attrs=attrs) def test_doc_api_right_edge(en_tokenizer): diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py index 75a28380c..87d475f1f 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_span_merge.py @@ -16,17 +16,9 @@ def test_spans_merge_tokens(en_tokenizer): assert len(doc) == 4 assert doc[0].head.text == "Angeles" assert doc[1].head.text == "start" - doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE") - assert len(doc) == 3 - assert doc[0].text == "Los Angeles" - assert doc[0].head.text == "start" - - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert len(doc) == 4 - assert doc[0].head.text == "Angeles" - assert doc[1].head.text == "start" - doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE") - + with doc.retokenize() as retokenizer: + attrs = {"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"} + retokenizer.merge(doc[0:2], attrs=attrs) assert len(doc) == 3 assert doc[0].text == "Los Angeles" assert doc[0].head.text == "start" @@ -71,30 +63,28 @@ def test_span_np_merges(en_tokenizer): heads = [1, 0, 2, 1, -3, -1, -1, -1] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert doc[4].head.i == 1 - doc.merge( - doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O" - ) + with doc.retokenize() as retokenizer: + attrs = {"tag": "NP", "lemma": "tool", "ent_type": "O"} + retokenizer.merge(doc[2:5], attrs=attrs) assert doc[2].head.i == 1 text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." 
heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - - ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents] - for start, end, label, lemma in ents: - merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label) - assert merged is not None, (start, end, label, lemma) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + attrs = {"tag": ent.label_, "lemma": ent.lemma_, "ent_type": ent.label_} + retokenizer.merge(ent, attrs=attrs) text = "One test with entities like New York City so the ents list is not void" heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - for span in doc.ents: - merged = doc.merge() - assert merged is not None, (span.start, span.end, span.label_, span.lemma_) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + retokenizer.merge(ent) def test_spans_entity_merge(en_tokenizer): @@ -109,13 +99,11 @@ def test_spans_entity_merge(en_tokenizer): tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents ) assert len(doc) == 17 - for ent in doc.ents: - label, lemma, type_ = ( - ent.root.tag_, - ent.root.lemma_, - max(w.ent_type_ for w in ent), - ) - ent.merge(label=label, lemma=lemma, ent_type=type_) + with doc.retokenize() as retokenizer: + for ent in doc.ents: + ent_type = max(w.ent_type_ for w in ent) + attrs = {"lemma": ent.root.lemma_, "ent_type": ent_type} + retokenizer.merge(ent, attrs=attrs) # check looping is ok assert len(doc) == 15 @@ -132,7 +120,8 @@ def test_spans_entity_merge_iob(): assert doc[1].ent_iob_ == "I" assert doc[2].ent_iob_ == "I" assert doc[3].ent_iob_ == "B" - doc[0:1].merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:1]) assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" @@ -172,8 +161,10 @@ def test_spans_sentence_update_after_merge(en_tokenizer): sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) - doc[0:2].merge(label="none", lemma="none", ent_type="none") - doc[-2:].merge(label="none", lemma="none", ent_type="none") + with doc.retokenize() as retokenizer: + attrs = {"lemma": "none", "ent_type": "none"} + retokenizer.merge(doc[0:2], attrs=attrs) + retokenizer.merge(doc[-2:], attrs=attrs) assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 @@ -191,5 +182,7 @@ def test_spans_subtree_size_check(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) - doc[0:2].merge(label="none", lemma="none", ent_type="none") + with doc.retokenize() as retokenizer: + attrs = {"lemma": "none", "ent_type": "none"} + retokenizer.merge(doc[0:2], attrs=attrs) assert len(list(sent1.root.subtree)) == init_len - 1 diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 587d15dd7..7b303397e 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -18,4 +18,4 @@ LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", @pytest.mark.parametrize("lang", LANGUAGES) def test_lang_initialize(lang): """Test that languages can be initialized.""" - lang_cls = get_lang_class(lang)() + lang_cls = get_lang_class(lang)() # noqa: F841 diff --git a/spacy/tests/matcher/test_matcher_api.py 
b/spacy/tests/matcher/test_matcher_api.py index 9e87359d4..7f7ebfc73 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -46,7 +46,9 @@ def test_matcher_from_usage_docs(en_vocab): if doc.vocab.strings[match_id] == "HAPPY": doc.sentiment += 0.1 span = doc[start:end] - token = span.merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(span) + token = doc[start] token.vocab[token.text].norm_ = "happy emoji" matcher = Matcher(en_vocab) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 04e31d649..c140cb485 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -66,9 +66,9 @@ def test_parser_merge_pp(en_tokenizer): doc = get_doc( tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags ) - nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks] - for start, end, lemma in nps: - doc.merge(start, end, label="NP", lemma=lemma) + with doc.retokenize() as retokenizer: + for np in doc.noun_chunks: + retokenizer.merge(np, attrs={"lemma": np.lemma_}) assert doc[0].text == "A phrase" assert doc[1].text == "with" assert doc[2].text == "another phrase" diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 1b75406cc..00024d74a 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -9,7 +9,7 @@ from spacy.symbols import POS, VERB, VerbForm_inf from spacy.vocab import Vocab from spacy.language import Language from spacy.lemmatizer import Lemmatizer -from spacy.tokens import Doc +from spacy.tokens import Doc, Span from ..util import get_doc, make_tempdir @@ -204,12 +204,13 @@ def test_issue615(en_tokenizer): on the last match.""" if i != len(matches) - 1: return None - spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] - for ent_id, label, span in spans: - span.merge( - tag="NNP" if label else span.root.tag_, lemma=span.text, label=label - ) - doc.ents = doc.ents + ((label, span.start, span.end),) + spans = [Span(doc, start, end, label=label) for label, start, end in matches] + with doc.retokenize() as retokenizer: + for span in spans: + tag = "NNP" if span.label_ else span.root.tag_ + attrs = {"tag": tag, "lemma": span.text} + retokenizer.merge(span, attrs=attrs) + doc.ents = doc.ents + (span,) text = "The golf club is broken" pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] @@ -410,7 +411,7 @@ def test_issue957(en_tokenizer): """ # Skip test if pytest-timeout is not installed pytest.importorskip("pytest_timeout") - for punct in ['.', ',', '\'', '\"', ':', '?', '!', ';', '-']: + for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: string = "0" for i in range(1, 100): string += punct + str(i) diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 1bb6cebe1..28ebcb0a9 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -86,7 +86,8 @@ def test_issue1547(): words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] doc = Doc(Vocab(), words=words) doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] - doc[5:7].merge() + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[5:7]) assert [ent.text for ent in doc.ents] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b8d0daed..ba0801e34 100644 --- a/spacy/tokens/doc.pyx +++ 
b/spacy/tokens/doc.pyx @@ -898,6 +898,7 @@ cdef class Doc: indices did not fall at token boundaries. """ cdef unicode tag, lemma, ent_type + deprecation_warning(Warnings.W013.format(obj="Doc")) if len(args) == 3: deprecation_warning(Warnings.W003) tag, lemma, ent_type = args diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 593e6ddec..a418fc13f 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -18,6 +18,7 @@ from ..attrs cimport * from ..lexeme cimport Lexeme from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning +from ..errors import deprecation_warning from .underscore import Underscore, get_ext_args @@ -193,6 +194,7 @@ cdef class Span: attributes are inherited from the syntactic root token of the span. RETURNS (Token): The newly merged token. """ + deprecation_warning(Warnings.W013.format(obj="Span")) return self.doc.merge(self.start_char, self.end_char, *args, **attributes) From c31a9dabd53de47aa3bda065d95944bb61ffec78 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Feb 2019 10:29:59 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=92=AB=20=20Add=20en/em=20dash=20to?= =?UTF-8?q?=20prefixes=20and=20suffixes=20=20(#3281)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Auto-format * Add en/em dash to prefixes and suffixes --- spacy/lang/punctuation.py | 17 +++++++++-------- spacy/tests/regression/test_issue3277.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 spacy/tests/regression/test_issue3277.py diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 2ec8c3e43..17e20fa0c 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,14 +1,13 @@ # coding: utf8 from __future__ import unicode_literals -from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY, LIST_ICONS -from .char_classes import HYPHENS -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY +from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA _prefixes = ( - ["§", "%", "=", r"\+(?![0-9])"] + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES @@ -22,13 +21,15 @@ _suffixes = ( + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS - + ["'s", "'S", "’s", "’S"] + + ["'s", "'S", "’s", "’S", "—", "–"] + [ r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:{c})".format(c=CURRENCY), r"(?<=[0-9])(?:{u})".format(u=UNITS), - r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES), + r"(?<=[0-9{al}{e}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES + ), r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), ] ) @@ -40,8 +41,8 @@ _infixes = ( r"(?<=[0-9])[+\-\*^](?=[0-9-])", r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - r'(?<=[{a}])(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS), - r'(?<=[{a}])[:<>=/](?=[{a}])'.format(a=ALPHA), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py new file mode 100644 index 000000000..88ea67774 --- /dev/null +++ b/spacy/tests/regression/test_issue3277.py @@ -0,0 +1,11 @@ +# coding: utf-8 +from __future__ import unicode_literals + + 
+def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013"
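
A minimal usage sketch of the migration performed by the first patch (not part of the diff itself; it assumes spaCy v2.1+ with these changes installed and uses a blank English pipeline, so no trained model is required):

import spacy

nlp = spacy.blank("en")
doc = nlp("WKRO played songs by the beach boys all night")
assert len(doc) == 9

# Deprecated since v2.1 (now emits the W013 warning added in this patch):
#     doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag="NAMED")

# Replacement: merge one or more spans inside a single retokenize() block.
with doc.retokenize() as retokenizer:
    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
    retokenizer.merge(doc[4:7], attrs=attrs)

assert len(doc) == 7
assert doc[4].text == "the beach boys"
assert doc[4].tag_ == "NAMED"

The behaviour added by the second patch is exercised by test_issue3277 above: with the en/em dash added to the prefix and suffix rules, a leading "—" or "–" is split off as its own token instead of being glued to the following word.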