From b98d216205068024407ae02c27c06da04ea88ff8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:21:22 +0200 Subject: [PATCH] Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. * Update test * Move prefix patternt to more generic infix pattern * Clean up --- setup.cfg | 2 +- spacy/lang/ca/__init__.py | 22 +++++++- spacy/lang/ca/lemmatizer.py | 81 +++++++++++++++++++++++++++ spacy/lang/ca/punctuation.py | 44 +++++++++++++-- spacy/lang/ca/syntax_iterators.py | 46 +++++++++++++++ spacy/lang/ca/tokenizer_exceptions.py | 7 +++ spacy/tests/lang/ca/test_text.py | 4 +- spacy/tests/lang/test_lemmatizers.py | 2 +- 8 files changed, 198 insertions(+), 10 deletions(-) create mode 100644 spacy/lang/ca/lemmatizer.py create mode 100644 spacy/lang/ca/syntax_iterators.py diff --git a/setup.cfg b/setup.cfg index 2fedd8f5c..cd55911fe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,7 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=1.0.0,<1.1.0 + spacy_lookups_data>=1.0.1,<1.1.0 transformers = spacy_transformers>=1.0.1,<1.1.0 ray = diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 970b23c1e..81f39b13c 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,15 +1,23 @@ +from typing import Optional + +from thinc.api import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from .lemmatizer import CatalanLemmatizer class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS class Catalan(Language): @@ -17,4 +25,16 @@ class Catalan(Language): Defaults = CatalanDefaults +@Catalan.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Catalan"] diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py new file mode 100644 index 000000000..2518eb720 --- /dev/null +++ b/spacy/lang/ca/lemmatizer.py @@ -0,0 +1,81 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class CatalanLemmatizer(Lemmatizer): + """ + Copied from French Lemmatizer + Catalan language lemmatizer applies the default rule based lemmatization + procedure with some modifications for better Catalan language support. + + The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use + the rule-based lemmatization. As a last resort, the lemmatizer checks in + the lookup table. + """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + else: + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) + string = string.lower() + forms = [] + if string in index: + forms.append(string) + self.cache[cache_key] = forms + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms and string in lookup_table.keys(): + forms.append(self.lookup_lemmatize(token)[0]) + if not forms: + forms.append(string) + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index d50b75589..39db08f17 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,12 +1,46 @@ -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import CURRENCY +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT +from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + ] +) + +_units = _units.replace("% ", "") +UNITS = merge_chars(_units) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [r"-", "—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py new file mode 100644 index 000000000..c70d53e80 --- /dev/null +++ b/spacy/lang/ca/syntax_iterators.py @@ -0,0 +1,46 @@ +from ...symbols import NOUN, PROPN +from ...errors import Errors + + +def noun_chunks(doclike): + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] + np_label = doc.vocab.strings.add("NP") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + left = word.left_edge.i + right = word.right_edge.i + 1 + # leave prepositions and punctuation out of the left side of the chunk + if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT": + left = word.left_edge.i + 1 + prev_end = word.right_edge.i + # leave subordinated clauses and appositions out of the chunk + a = word.i + 1 + while a < word.right_edge.i: + paraula = doc[a] + if paraula.pos_ == "VERB": + right = paraula.left_edge.i + prev_end = paraula.left_edge.i - 1 + elif paraula.dep_ == "appos": + right = paraula.left_edge.i + 1 + prev_end = paraula.left_edge.i - 1 + a += 1 + # leave punctuation out of the right side of the chunk + if word.right_edge.pos_ == "PUNCT": + right = right - 1 + yield left, right, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index b465e97ba..5f9a50f5e 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -24,6 +24,13 @@ for exc_data in [ {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "'l"}, + {ORTH: "'ls"}, + {ORTH: "'m"}, + {ORTH: "'n"}, + {ORTH: "'ns"}, + {ORTH: "'s"}, + {ORTH: "'t"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 38f5fc708..55bad0e94 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 138 + assert len(tokens) == 140 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 6), + ("Perquè va anar-hi?", 4), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 10km", 5), diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index e755da22d..e419f0a14 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: es, pl -LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] +LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on