Update Catalan language data (#8308)
* Update Catalan language data

  Update Catalan language data based on contributions from the Text Mining
  Unit at the Barcelona Supercomputing Center:
  https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data

* Update tokenizer settings for UD Catalan AnCora

  Update for UD Catalan AnCora v2.7 with merged multi-word tokens.

* Update test
* Move prefix pattern to more generic infix pattern
* Clean up
This commit is contained in:
parent d9be9e6cf9
commit b98d216205
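Before the file-by-file diff, a minimal sketch of what the tokenizer update changes, assuming a spaCy install that includes this commit; the expected token count comes from the updated test further down:

import spacy

# A blank Catalan pipeline picks up the updated infixes, suffixes and
# tokenizer exceptions from spacy/lang/ca.
nlp = spacy.blank("ca")
doc = nlp("Perquè va anar-hi?")
print([t.text for t in doc])  # 4 tokens expected per the updated test below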
setup.cfg
@@ -65,7 +65,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.1,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
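The bumped lookups pin matters because the new rule lemmatizer below needs tables that ship with newer spacy-lookups-data releases. A quick environment check, as a sketch:

# Sketch: verify the installed lookups package satisfies the new pin.
# "spacy-lookups-data" is the PyPI distribution name for spacy_lookups_data.
from importlib.metadata import version  # Python 3.8+

print(version("spacy-lookups-data"))  # want >=1.0.1,<1.1.0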
spacy/lang/ca/__init__.py
@@ -1,15 +1,23 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from .lemmatizer import CatalanLemmatizer


 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


 class Catalan(Language):
@@ -17,4 +25,16 @@ class Catalan(Language):
     Defaults = CatalanDefaults


+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Catalan"]
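With the factory registered, the lemmatizer is added by name like any other pipe. A minimal usage sketch; the "rule" mode needs the lookup tables from spacy-lookups-data:

import spacy

# The @Catalan.factory registration above makes "lemmatizer" resolve to
# CatalanLemmatizer for Catalan pipelines.
nlp = spacy.blank("ca")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()  # loads lemma_lookup, lemma_rules, lemma_exc, lemma_index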
spacy/lang/ca/lemmatizer.py (new file, 81 lines)
@@ -0,0 +1,81 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class CatalanLemmatizer(Lemmatizer):
+    """
+    Copied from French Lemmatizer
+    Catalan language lemmatizer applies the default rule based lemmatization
+    procedure with some modifications for better Catalan language support.
+
+    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
+    the rule-based lemmatization. As a last resort, the lemmatizer checks in
+    the lookup table.
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "rule":
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            self.cache[cache_key] = forms
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(self.lookup_lemmatize(token)[0])
+        if not forms:
+            forms.append(string)
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
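The core of rule_lemmatize is the suffix-rewrite loop: strip a matching ending, attach the replacement, and prefer candidates found in the index. The mechanic in isolation, with illustrative rules rather than the real lemma_rules table:

# Illustrative only: real rules come from the "lemma_rules" lookups table.
rules = [("ques", "ca"), ("es", "a")]
string = "butxaques"
for old, new in rules:
    if string.endswith(old):
        print(string[: len(string) - len(old)] + new)  # -> "butxaca"
        break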
spacy/lang/ca/punctuation.py
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units


 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)


 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
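Moving the elision pattern from a prefix to an infix means the split point for elided articles is found inside the token. A minimal regex check of that one pattern in isolation, assuming spacy.lang.char_classes.ALPHA as imported above:

import re

from spacy.lang.char_classes import ALPHA

ELISION = "'’"
# The elision infix above: a zero-width split point after letter+apostrophe.
pattern = r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION)
match = re.search(pattern, "l'any")
print(match.span() if match else None)  # (2, 2): split between "l'" and "any"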
spacy/lang/ca/syntax_iterators.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+from ...symbols import NOUN, PROPN
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    np_label = doc.vocab.strings.add("NP")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            left = word.left_edge.i
+            right = word.right_edge.i + 1
+            # leave prepositions and punctuation out of the left side of the chunk
+            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
+                left = word.left_edge.i + 1
+            prev_end = word.right_edge.i
+            # leave subordinated clauses and appositions out of the chunk
+            a = word.i + 1
+            while a < word.right_edge.i:
+                paraula = doc[a]
+                if paraula.pos_ == "VERB":
+                    right = paraula.left_edge.i
+                    prev_end = paraula.left_edge.i - 1
+                elif paraula.dep_ == "appos":
+                    right = paraula.left_edge.i + 1
+                    prev_end = paraula.left_edge.i - 1
+                a += 1
+            # leave punctuation out of the right side of the chunk
+            if word.right_edge.pos_ == "PUNCT":
+                right = right - 1
+            yield left, right, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
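Once registered via syntax_iterators in CatalanDefaults, this iterator backs Doc.noun_chunks whenever a dependency parse is present. A hedged usage sketch; the model name is an assumption, and any parsed Catalan pipeline would do:

import spacy

# Needs a parser: noun_chunks raises E029 without "DEP" annotation.
nlp = spacy.load("ca_core_news_sm")  # assumed model name
doc = nlp("La meva germana viu a Barcelona.")
print([chunk.text for chunk in doc.noun_chunks])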
spacy/lang/ca/tokenizer_exceptions.py
@@ -24,6 +24,13 @@ for exc_data in [
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
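The added entries are Catalan enclitic pronouns in their elided forms, kept as single tokens rather than being split further. A hedged way to inspect how the tokenizer handles one, without asserting a particular output:

import spacy

# Hedged check of how the tokenizer treats an elided clitic (the exception
# entries above cover "'l", "'ls", "'m", "'n", "'ns", "'s", "'t").
nlp = spacy.blank("ca")
print(nlp.tokenizer.explain("vendre'l"))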
spacy/tests/lang/ca/test_text.py
@@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
     una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""

     tokens = ca_tokenizer(text)
-    assert len(tokens) == 138
+    assert len(tokens) == 140


 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 6),
+        ("Perquè va anar-hi?", 4),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
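To re-run just these tests from a source checkout, a sketch that assumes pytest is installed:

# Hedged: standard pytest invocation via the Python API, path per the repo layout.
import pytest

pytest.main(["spacy/tests/lang/ca/test_text.py", "-q"])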
spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: es, pl
-LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
+LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on
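Adding "ca" to LANGUAGES pulls Catalan into the lemmatizer smoke tests, which in turn exercise the lookups tables pinned in setup.cfg above. A rough, hedged paraphrase of what that test does per language, not the literal test body:

from spacy.util import get_lang_class

# Build the language from its registered class and initialize a lookup-mode
# lemmatizer (requires spacy-lookups-data, per the bumped pin).
lang_cls = get_lang_class("ca")
nlp = lang_cls()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
nlp.initialize()
assert lemmatizer.mode == "lookup"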