Improve tokenization for UD Spanish AnCora (#5253)

adrianeboyd authored on 2020-04-06 13:18:23 +02:00; committed via GitHub
parent f4ef64a526
commit e8be15e9b7
3 changed files with 54 additions and 1 deletion

spacy/lang/es/__init__.py

@@ -6,6 +6,7 @@ from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -23,6 +24,8 @@ class SpanishDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS

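For reference, spaCy compiles these Defaults attributes into single regexes when the tokenizer is built; a minimal sketch of that wiring with the public compile_infix_regex / compile_suffix_regex helpers (assumes a spaCy v2.x install that includes this commit):

from spacy.util import compile_infix_regex, compile_suffix_regex
from spacy.lang.es import Spanish

# Each list of pattern strings is joined into one compiled regex.
infix_re = compile_infix_regex(Spanish.Defaults.infixes)
suffix_re = compile_suffix_regex(Spanish.Defaults.suffixes)

# e.g. the infix pattern now matches the slash in "casa/coche"
print(infix_re.search("casa/coche"))
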
spacy/lang/es/punctuation.py (new file)

@@ -0,0 +1,48 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
from ..char_classes import merge_chars
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES


# AnCora keeps "%" attached to a preceding number, so drop it from the
# unit characters that get split off as suffixes
_list_units = [u for u in LIST_UNITS if u != "%"]
_units = merge_chars(" ".join(_list_units))
# treat em/en dashes like quote characters in the rules below
_concat_quotes = CONCAT_QUOTES + "—–"
_suffixes = (
    # split trailing em/en dashes off the token
    ["—", "–"]
    + LIST_PUNCT
    + LIST_ELLIPSES
    + LIST_QUOTES
    + LIST_ICONS
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=_units),
        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_concat_quotes, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
)
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # arithmetic operators between digits
        r"(?<=[0-9])[+\*^](?=[0-9-])",
        # period between letters or quote-like characters
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=_concat_quotes
        ),
        # comma between letters, i.e. a missing space after a comma
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # :, <, >, =, / between letters/digits and letters
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)
TOKENIZER_SUFFIXES = _suffixes
TOKENIZER_INFIXES = _infixes
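
A quick end-to-end check of the new rules, as a sketch (assumes a blank Spanish pipeline built from this branch; results printed rather than asserted):

from spacy.lang.es import Spanish

nlp = Spanish()
for text in ["casa/coche", "uno,dos", "25%"]:
    # "/" between letters is a new infix split; "%" is no longer split
    # off a preceding number, matching UD Spanish AnCora
    print(text, "->", [t.text for t in nlp(text)])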

spacy/lang/es/tokenizer_exceptions.py

@@ -43,14 +43,16 @@ for orth in [
"Av.",
"Avda.",
"Cía.",
"EE.UU.",
"etc.",
"fig.",
"Gob.",
"Gral.",
"Ing.",
"J.C.",
"km/h",
"Lic.",
"m.n.",
"no.",
"núm.",
"P.D.",
"Prof.",
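
These exceptions should survive as single tokens even where the new punctuation rules would otherwise split them (e.g. the "/" infix in "km/h"), since the tokenizer matches special cases before applying suffix and infix rules. Another quick sketch under the same assumptions:

from spacy.lang.es import Spanish

nlp = Spanish()
for text in ["EE.UU.", "km/h", "núm. 2"]:
    print(text, "->", [t.text for t in nlp(text)])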