Update Catalan language data (#8308)
* Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data
* Update tokenizer settings for UD Catalan AnCora v2.7 with merged multi-word tokens
* Update test
* Move prefix pattern to a more generic infix pattern
* Clean up
Parent: d9be9e6cf9
Commit: b98d216205
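A quick way to exercise the updated tokenizer settings is a blank Catalan pipeline, which picks up the new language defaults automatically. This is a minimal sketch, not part of the diff, and it assumes a spaCy install that includes this commit:

# Minimal sketch: tokenize with the updated Catalan defaults.
# Assumes a spaCy build that includes this commit.
import spacy

nlp = spacy.blank("ca")  # loads the updated ca defaults
doc = nlp("Ens asseiem -fotògraf i periodista- en una terrassa buida.")
print([t.text for t in doc])  # inspect the tokens produced by the new rules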
setup.cfg
@@ -65,7 +65,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.1,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
spacy/lang/ca/__init__.py
@@ -1,15 +1,23 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from .lemmatizer import CatalanLemmatizer


 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS


 class Catalan(Language):
@@ -17,4 +25,16 @@ class Catalan(Language):
     Defaults = CatalanDefaults


+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Catalan"]
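The factory block above is what `nlp.add_pipe("lemmatizer")` resolves to for Catalan, with rule mode as the default config. A usage sketch, assuming spacy-lookups-data>=1.0.1 (the bump in setup.cfg above) is installed so the lemma tables can be loaded; without a tagger there are no POS tags for the rule mode, so this is only a smoke test:

# Sketch: add the newly registered Catalan rule lemmatizer to a blank pipeline.
# Assumes spacy-lookups-data>=1.0.1 is installed; nlp.initialize() loads the
# lemma_rules / lemma_exc / lemma_index / lemma_lookup tables from it.
import spacy

nlp = spacy.blank("ca")
nlp.add_pipe("lemmatizer", config={"mode": "rule", "overwrite": False})
nlp.initialize()
doc = nlp("les cases")  # no tagger in this pipeline, so this only smoke-tests the component
print([(t.text, t.lemma_) for t in doc])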
spacy/lang/ca/lemmatizer.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class CatalanLemmatizer(Lemmatizer):
    """
    Copied from French Lemmatizer
    Catalan language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Catalan language support.

    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
    the rule-based lemmatization. As a last resort, the lemmatizer checks in
    the lookup table.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
            self.cache[cache_key] = forms
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
        forms = list(set(forms))
        self.cache[cache_key] = forms
        return forms
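The heart of rule_lemmatize above is the suffix-rewrite loop: strip a matching ending, append the replacement, and prefer candidates that exist in the POS-specific index. A stripped-down illustration with made-up data follows; the rules and index here are invented examples, not spaCy's real Catalan tables, and the isalpha special case is omitted:

# Toy illustration of the suffix-rule step in rule_lemmatize above.
# toy_rules and toy_index are invented examples, not spaCy's lemma tables.
toy_rules = [("es", "a"), ("s", "")]   # (old ending, replacement)
toy_index = {"casa", "gran"}           # known lemma forms for this POS

def toy_rule_lemmatize(string: str) -> list:
    forms, oov_forms = [], []
    for old, new in toy_rules:
        if string.endswith(old):
            form = string[: len(string) - len(old)] + new
            if form in toy_index:
                forms.append(form)
            elif form:
                oov_forms.append(form)
    # fall back to out-of-vocabulary candidates, then to the surface form
    return forms or oov_forms or [string]

print(toy_rule_lemmatize("cases"))  # ['casa']
print(toy_rule_lemmatize("grans"))  # ['gran']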
spacy/lang/ca/punctuation.py
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units


 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)


 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
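The elision infix kept from the old file, now anchored to `[{a}0-9]` on the right, is what separates an apostrophe contraction from the word that follows it. A self-contained check of that one pattern; the ALPHA class below is a simplified stand-in for spaCy's full character class, not the real definition:

# Standalone check of the elision infix pattern from the diff above.
# ALPHA here is a simplified stand-in for spaCy's real ALPHA character class.
import re

ALPHA = "A-Za-zÀ-ÿ"
ELISION = "'’"
infix = r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION)

print(re.split(infix, "l'aigua"))   # ["l'", 'aigua']
print(re.split(infix, "d'aquest"))  # ["d'", 'aquest']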
spacy/lang/ca/syntax_iterators.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from ...symbols import NOUN, PROPN
from ...errors import Errors


def noun_chunks(doclike):
    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
    # fmt: off
    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)
    np_deps = [doc.vocab.strings[label] for label in labels]
    np_label = doc.vocab.strings.add("NP")
    prev_end = -1
    for i, word in enumerate(doclike):
        if word.pos not in (NOUN, PROPN):
            continue
        # Prevent nested chunks from being produced
        if word.left_edge.i <= prev_end:
            continue
        if word.dep in np_deps:
            left = word.left_edge.i
            right = word.right_edge.i + 1
            # leave prepositions and punctuation out of the left side of the chunk
            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
                left = word.left_edge.i + 1
            prev_end = word.right_edge.i
            # leave subordinated clauses and appositions out of the chunk
            a = word.i + 1
            while a < word.right_edge.i:
                paraula = doc[a]
                if paraula.pos_ == "VERB":
                    right = paraula.left_edge.i
                    prev_end = paraula.left_edge.i - 1
                elif paraula.dep_ == "appos":
                    right = paraula.left_edge.i + 1
                    prev_end = paraula.left_edge.i - 1
                a += 1
            # leave punctuation out of the right side of the chunk
            if word.right_edge.pos_ == "PUNCT":
                right = right - 1
            yield left, right, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
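The iterator yields (left, right, label) offsets that spaCy exposes as doc.noun_chunks spans. A usage sketch; it assumes a trained Catalan pipeline with a tagger and parser is available, and the package name "ca_core_news_sm" is an assumption, not something this commit provides:

# Usage sketch for the new Catalan noun_chunks iterator.
# Assumes a trained Catalan pipeline with tagger + parser; the package name
# below is illustrative and not part of this commit.
import spacy

nlp = spacy.load("ca_core_news_sm")
doc = nlp("El fotògraf i la periodista beuen una gerra de cervesa.")
for chunk in doc.noun_chunks:
    print(chunk.text, "->", chunk.root.dep_)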
spacy/lang/ca/tokenizer_exceptions.py
@@ -24,6 +24,13 @@ for exc_data in [
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
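The new ORTH-only entries register the Catalan clitic forms as tokenizer special cases, so each listed string survives as a single token. The same mechanism is available at runtime; in the sketch below, the check assumes a spaCy install that includes this commit, and the extra form added by hand is purely illustrative:

# Quick check of the special-case mechanism behind the entries above.
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("ca")
print("'ls" in nlp.tokenizer.rules)  # expected True with this commit's exceptions loaded
# the same mechanism, registered by hand for an illustrative extra form:
nlp.tokenizer.add_special_case("'hi", [{ORTH: "'hi"}])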
spacy/tests/lang/ca/test_text.py
@@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
     una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""

     tokens = ca_tokenizer(text)
-    assert len(tokens) == 138
+    assert len(tokens) == 140


 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 6),
+        ("Perquè va anar-hi?", 4),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: es, pl
-LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
+LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on