From e8be15e9b79ba66497b59947c31604b48793bde0 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 6 Apr 2020 13:18:23 +0200
Subject: [PATCH] Improve tokenization for UD Spanish AnCora (#5253)

---
 spacy/lang/es/__init__.py             |  3 ++
 spacy/lang/es/punctuation.py          | 48 +++++++++++++++++++++++++++
 spacy/lang/es/tokenizer_exceptions.py |  4 ++-
 3 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 spacy/lang/es/punctuation.py

diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py
index 80cc1727c..249748a17 100644
--- a/spacy/lang/es/__init__.py
+++ b/spacy/lang/es/__init__.py
@@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
     )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     tag_map = TAG_MAP
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     syntax_iterators = SYNTAX_ITERATORS
 
diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
new file mode 100644
index 000000000..42335237c
--- /dev/null
+++ b/spacy/lang/es/punctuation.py
@@ -0,0 +1,48 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
+from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import merge_chars
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
+
+
+_list_units = [u for u in LIST_UNITS if u != "%"]
+_units = merge_chars(" ".join(_list_units))
+_concat_quotes = CONCAT_QUOTES + "—–"
+
+
+_suffixes = (
+    ["—", "–"]
+    + LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index 9109d658b..2c2631086 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -43,14 +43,16 @@ for orth in [
     "Av.",
     "Avda.",
     "Cía.",
+    "EE.UU.",
     "etc.",
+    "fig.",
     "Gob.",
     "Gral.",
     "Ing.",
     "J.C.",
+    "km/h",
     "Lic.",
     "m.n.",
-    "no.",
     "núm.",
     "P.D.",
     "Prof.",
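
Notes on the change, with sketches (not part of the patch):

The easiest way to see the combined effect of the new suffix/infix rules and the exception edits is through the Spanish tokenizer itself. A minimal sketch, assuming spaCy v2.x with this patch applied; the tokenizations in the comments are illustrative expectations, not captured output:

    # Exercise the updated Spanish tokenizer on strings the new rules target.
    from spacy.lang.es import Spanish

    nlp = Spanish()  # tokenizer only; no pipeline components needed

    # "—" and "–" are now suffixes, so a trailing em dash splits off:
    print([t.text for t in nlp("Dijo adiós—")])
    # expected: ['Dijo', 'adiós', '—']

    # "EE.UU." and "km/h" are new exact-match exceptions and stay whole;
    # "no." was removed, so sentence-final "no." now splits into "no" + ".":
    print([t.text for t in nlp("Viajó a EE.UU. a 120 km/h, pero no.")])
    # expected: ['Viajó', 'a', 'EE.UU.', 'a', '120', 'km/h', ',', 'pero', 'no', '.']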
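
The new comma infix is worth a closer look: it fires only between letters, which is what keeps Spanish decimal numbers like "3,5" as single tokens while still splitting a comma that glues two words together. A standalone sketch using only re, with a simplified stand-in for spaCy's ALPHA class (the real class covers all Unicode letters):

    import re

    # Hypothetical, simplified ALPHA for illustration; spaCy's is far broader.
    ALPHA = "a-zA-ZáéíóúüñÁÉÍÓÚÜÑ"
    infix_comma = re.compile(r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA))

    print(bool(infix_comma.search("uno,dos")))  # True: letter,letter is a split point
    print(bool(infix_comma.search("3,5")))      # False: the decimal comma survives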
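
For completeness: SpanishDefaults hands these lists to spaCy's regex compilers when the tokenizer is created, so individual rules can also be probed directly. A sketch against the v2.x utility functions; the matches in the comments are expected, not verified output:

    from spacy.util import compile_infix_regex, compile_suffix_regex
    from spacy.lang.es.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

    suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)  # pieces anchored at the token end
    infix_re = compile_infix_regex(TOKENIZER_INFIXES)

    print(suffix_re.search("adiós—").group())      # expected: '—'
    print(infix_re.search("clave:valor").group())  # expected: ':'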