Update Catalan language data (#8308)

* Update Catalan language data

Update Catalan language data based on contributions from the Text Mining
Unit at the Barcelona Supercomputing Center:

https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data

* Update tokenizer settings for UD Catalan AnCora

Update for UD Catalan AnCora v2.7 with merged multi-word tokens.

* Update test

* Move prefix patternt to more generic infix pattern

* Clean up
This commit is contained in:
Adriane Boyd 2021-06-11 10:21:22 +02:00 committed by GitHub
parent d9be9e6cf9
commit b98d216205
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 198 additions and 10 deletions

View File

@ -65,7 +65,7 @@ console_scripts =
[options.extras_require]
lookups =
spacy_lookups_data>=1.0.0,<1.1.0
spacy_lookups_data>=1.0.1,<1.1.0
transformers =
spacy_transformers>=1.0.1,<1.1.0
ray =

View File

@ -1,15 +1,23 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from .lemmatizer import CatalanLemmatizer
class CatalanDefaults(Language.Defaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
syntax_iterators = SYNTAX_ITERATORS
class Catalan(Language):
@ -17,4 +25,16 @@ class Catalan(Language):
Defaults = CatalanDefaults
@Catalan.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Catalan"]

View File

@ -0,0 +1,81 @@
from typing import List, Tuple
from ...pipeline import Lemmatizer
from ...tokens import Token
class CatalanLemmatizer(Lemmatizer):
"""
Copied from French Lemmatizer
Catalan language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Catalan language support.
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
the rule-based lemmatization. As a last resort, the lemmatizer checks in
the lookup table.
"""
@classmethod
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule":
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
return (required, [])
else:
return super().get_lookups_config(mode)
def rule_lemmatize(self, token: Token) -> List[str]:
cache_key = (token.orth, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
return [string.lower()]
elif "lemma_rules" not in self.lookups or univ_pos not in (
"noun",
"verb",
"adj",
"adp",
"adv",
"aux",
"cconj",
"det",
"pron",
"punct",
"sconj",
):
return self.lookup_lemmatize(token)
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
lookup_table = self.lookups.get_table("lemma_lookup", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms and string in lookup_table.keys():
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
forms = list(set(forms))
self.cache[cache_key] = forms
return forms

View File

@ -1,12 +1,46 @@
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CURRENCY
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
from ..char_classes import merge_chars, _units
ELISION = " ' ".strip().replace(" ", "").replace("\n", "")
_infixes = TOKENIZER_INFIXES + [
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
]
)
_units = _units.replace("% ", "")
UNITS = merge_chars(_units)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_ICONS
+ [r"-", "", ""]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
)
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -0,0 +1,46 @@
from ...symbols import NOUN, PROPN
from ...errors import Errors
def noun_chunks(doclike):
"""Detect base noun phrases from a dependency parse. Works on Doc and Span."""
# fmt: off
labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
# fmt: on
doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
np_deps = [doc.vocab.strings[label] for label in labels]
np_label = doc.vocab.strings.add("NP")
prev_end = -1
for i, word in enumerate(doclike):
if word.pos not in (NOUN, PROPN):
continue
# Prevent nested chunks from being produced
if word.left_edge.i <= prev_end:
continue
if word.dep in np_deps:
left = word.left_edge.i
right = word.right_edge.i + 1
# leave prepositions and punctuation out of the left side of the chunk
if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
left = word.left_edge.i + 1
prev_end = word.right_edge.i
# leave subordinated clauses and appositions out of the chunk
a = word.i + 1
while a < word.right_edge.i:
paraula = doc[a]
if paraula.pos_ == "VERB":
right = paraula.left_edge.i
prev_end = paraula.left_edge.i - 1
elif paraula.dep_ == "appos":
right = paraula.left_edge.i + 1
prev_end = paraula.left_edge.i - 1
a += 1
# leave punctuation out of the right side of the chunk
if word.right_edge.pos_ == "PUNCT":
right = right - 1
yield left, right, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@ -24,6 +24,13 @@ for exc_data in [
{ORTH: "núm", NORM: "número"},
{ORTH: "St.", NORM: "sant"},
{ORTH: "Sta.", NORM: "santa"},
{ORTH: "'l"},
{ORTH: "'ls"},
{ORTH: "'m"},
{ORTH: "'n"},
{ORTH: "'ns"},
{ORTH: "'s"},
{ORTH: "'t"},
]:
_exc[exc_data[ORTH]] = [exc_data]

View File

@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
tokens = ca_tokenizer(text)
assert len(tokens) == 138
assert len(tokens) == 140
@pytest.mark.parametrize(
"text,length",
[
("Perquè va anar-hi?", 6),
("Perquè va anar-hi?", 4),
("“Ah no?”", 5),
("""Sí! "Anem", va contestar el Joan Carles""", 11),
("Van córrer aprox. 10km", 5),

View File

@ -8,7 +8,7 @@ from spacy.util import get_lang_class
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: es, pl
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on