Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-04 01:48:04 +03:00
	Update Catalan language data (#8308)
* Update Catalan language data

  Update Catalan language data based on contributions from the Text Mining Unit
  at the Barcelona Supercomputing Center:
  https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data

* Update tokenizer settings for UD Catalan AnCora

  Update for UD Catalan AnCora v2.7 with merged multi-word tokens.

* Update test

* Move prefix pattern to more generic infix pattern

* Clean up
parent d9be9e6cf9
commit b98d216205
setup.cfg
@@ -65,7 +65,7 @@ console_scripts =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=1.0.0,<1.1.0
+    spacy_lookups_data>=1.0.1,<1.1.0
 transformers =
     spacy_transformers>=1.0.1,<1.1.0
 ray =
spacy/lang/ca/__init__.py
@@ -1,15 +1,23 @@
+from typing import Optional
+
+from thinc.api import Model
+
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
+from .lemmatizer import CatalanLemmatizer
 
 
 class CatalanDefaults(Language.Defaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Catalan(Language):
@@ -17,4 +25,16 @@ class Catalan(Language):
     Defaults = CatalanDefaults
 
 
+@Catalan.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
+):
+    return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Catalan"]
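For context, the factory above wires the new lemmatizer into the standard add_pipe API. A minimal sketch of using it (assuming the lemma_* tables are available through the spacy-lookups-data package; note that rule mode reads token.pos_, so a tagger or morphologizer must run earlier in a real pipeline):

import spacy

# Blank Catalan pipeline; "lemmatizer" resolves to the factory above,
# with mode="rule" as its default config.
nlp = spacy.blank("ca")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()  # loads the required lemma_* tables from spacy-lookups-data

# Without a tagger earlier in the pipeline, token.pos_ is empty and
# rule_lemmatize falls back to the lowercased surface form.
doc = nlp("les cases")
print([token.lemma_ for token in doc])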
spacy/lang/ca/lemmatizer.py (new file, 81 lines)
@@ -0,0 +1,81 @@
+from typing import List, Tuple
+
+from ...pipeline import Lemmatizer
+from ...tokens import Token
+
+
+class CatalanLemmatizer(Lemmatizer):
+    """
+    Copied from French Lemmatizer
+    Catalan language lemmatizer applies the default rule based lemmatization
+    procedure with some modifications for better Catalan language support.
+
+    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
+    the rule-based lemmatization. As a last resort, the lemmatizer checks in
+    the lookup table.
+    """
+
+    @classmethod
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
+        if mode == "rule":
+            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
+            return (required, [])
+        else:
+            return super().get_lookups_config(mode)
+
+    def rule_lemmatize(self, token: Token) -> List[str]:
+        cache_key = (token.orth, token.pos)
+        if cache_key in self.cache:
+            return self.cache[cache_key]
+        string = token.text
+        univ_pos = token.pos_.lower()
+        if univ_pos in ("", "eol", "space"):
+            return [string.lower()]
+        elif "lemma_rules" not in self.lookups or univ_pos not in (
+            "noun",
+            "verb",
+            "adj",
+            "adp",
+            "adv",
+            "aux",
+            "cconj",
+            "det",
+            "pron",
+            "punct",
+            "sconj",
+        ):
+            return self.lookup_lemmatize(token)
+        index_table = self.lookups.get_table("lemma_index", {})
+        exc_table = self.lookups.get_table("lemma_exc", {})
+        rules_table = self.lookups.get_table("lemma_rules", {})
+        lookup_table = self.lookups.get_table("lemma_lookup", {})
+        index = index_table.get(univ_pos, {})
+        exceptions = exc_table.get(univ_pos, {})
+        rules = rules_table.get(univ_pos, [])
+        string = string.lower()
+        forms = []
+        if string in index:
+            forms.append(string)
+            self.cache[cache_key] = forms
+            return forms
+        forms.extend(exceptions.get(string, []))
+        oov_forms = []
+        if not forms:
+            for old, new in rules:
+                if string.endswith(old):
+                    form = string[: len(string) - len(old)] + new
+                    if not form:
+                        pass
+                    elif form in index or not form.isalpha():
+                        forms.append(form)
+                    else:
+                        oov_forms.append(form)
+        if not forms:
+            forms.extend(oov_forms)
+        if not forms and string in lookup_table.keys():
+            forms.append(self.lookup_lemmatize(token)[0])
+        if not forms:
+            forms.append(string)
+        forms = list(set(forms))
+        self.cache[cache_key] = forms
+        return forms
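To make the fallback order in rule_lemmatize easier to follow, here is a self-contained toy of the same chain (index, then exceptions, then suffix rules, then the lookup table, then the surface form). All table contents are invented for illustration; they are not the real Catalan data:

# Toy stand-ins for the lemma_* lookup tables (invented data).
index = {"casa"}                      # known lemmas for this POS
exceptions = {"dones": ["dona"]}      # irregular forms
rules = [("es", "a"), ("s", "")]      # (old_suffix, new_suffix) rewrites
lookup = {"fou": "ser"}               # last-resort lookup table

def toy_rule_lemmatize(string):
    string = string.lower()
    if string in index:                       # 1) already a lemma
        return [string]
    forms = list(exceptions.get(string, []))  # 2) irregular exception
    oov_forms = []
    if not forms:
        for old, new in rules:                # 3) suffix rewrite rules
            if string.endswith(old):
                form = string[: len(string) - len(old)] + new
                if form in index:
                    forms.append(form)        # rule output confirmed by index
                elif form:
                    oov_forms.append(form)    # keep as out-of-vocabulary guess
    if not forms:
        forms.extend(oov_forms)               # 4) accept OOV guesses
    if not forms and string in lookup:
        forms.append(lookup[string])          # 5) lookup-table fallback
    return forms or [string]                  # 6) last resort: surface form

print(toy_rule_lemmatize("cases"))  # ['casa']  via suffix rule + index
print(toy_rule_lemmatize("dones"))  # ['dona']  via the exception table
print(toy_rule_lemmatize("fou"))    # ['ser']   via the lookup fallback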
spacy/lang/ca/punctuation.py
@@ -1,12 +1,46 @@
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import ALPHA
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import CURRENCY
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
+from ..char_classes import merge_chars, _units
 
 
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
 
-_infixes = TOKENIZER_INFIXES + [
-    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
-]
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION),
+    ]
+)
+
+_units = _units.replace("% ", "")
+UNITS = merge_chars(_units)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_ICONS
+    + [r"-", "—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+)
 
 TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
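The last infix above is the elision pattern, generalized from the previous version to also allow a digit after the apostrophe. It matches a zero-width position after letter+apostrophe, which is what splits proclitics such as "l'" and "d'" off the following word. A standalone sketch of just that pattern, using plain re and a simplified stand-in for ALPHA (the real char class covers much more of Unicode):

import re

ALPHA = "a-zA-ZàèéíòóúïüçÀÈÉÍÒÓÚÏÜÇ"  # simplified; not spaCy's full ALPHA
ELISION = "'’"

# Same shape as the infix added above: split after letter+apostrophe,
# before a letter or digit.
infix = r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION)

def split_on_infix(text):
    pieces, start = [], 0
    for match in re.finditer(infix, text):
        pieces.append(text[start:match.start()])
        start = match.start()
    pieces.append(text[start:])
    return pieces

print(split_on_infix("l'home"))  # ["l'", 'home']
print(split_on_infix("d'una"))   # ["d'", 'una']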
spacy/lang/ca/syntax_iterators.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+from ...symbols import NOUN, PROPN
+from ...errors import Errors
+
+
+def noun_chunks(doclike):
+    """Detect base noun phrases from a dependency parse. Works on Doc and Span."""
+    # fmt: off
+    labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
+    # fmt: on
+    doc = doclike.doc  # Ensure works on both Doc and Span.
+    if not doc.has_annotation("DEP"):
+        raise ValueError(Errors.E029)
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    np_label = doc.vocab.strings.add("NP")
+    prev_end = -1
+    for i, word in enumerate(doclike):
+        if word.pos not in (NOUN, PROPN):
+            continue
+        # Prevent nested chunks from being produced
+        if word.left_edge.i <= prev_end:
+            continue
+        if word.dep in np_deps:
+            left = word.left_edge.i
+            right = word.right_edge.i + 1
+            # leave prepositions and punctuation out of the left side of the chunk
+            if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT":
+                left = word.left_edge.i + 1
+            prev_end = word.right_edge.i
+            # leave subordinated clauses and appositions out of the chunk
+            a = word.i + 1
+            while a < word.right_edge.i:
+                paraula = doc[a]
+                if paraula.pos_ == "VERB":
+                    right = paraula.left_edge.i
+                    prev_end = paraula.left_edge.i - 1
+                elif paraula.dep_ == "appos":
+                    right = paraula.left_edge.i + 1
+                    prev_end = paraula.left_edge.i - 1
+                a += 1
+            # leave punctuation out of the right side of the chunk
+            if word.right_edge.pos_ == "PUNCT":
+                right = right - 1
+            yield left, right, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
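Because CatalanDefaults registers this iterator (see __init__.py above), doc.noun_chunks becomes available for Catalan whenever a dependency parse is present. A usage sketch; ca_core_news_sm is used here as an example of a trained Catalan pipeline and is not part of this commit:

import spacy

# Requires a parser-equipped Catalan pipeline, e.g.:
#   python -m spacy download ca_core_news_sm
nlp = spacy.load("ca_core_news_sm")

doc = nlp("El gat negre dorm sobre la taula de la cuina.")
for chunk in doc.noun_chunks:  # backed by the noun_chunks iterator above
    print(chunk.text, "->", chunk.root.dep_)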
spacy/lang/ca/tokenizer_exceptions.py
@@ -24,6 +24,13 @@ for exc_data in [
     {ORTH: "núm", NORM: "número"},
     {ORTH: "St.", NORM: "sant"},
     {ORTH: "Sta.", NORM: "santa"},
+    {ORTH: "'l"},
+    {ORTH: "'ls"},
+    {ORTH: "'m"},
+    {ORTH: "'n"},
+    {ORTH: "'ns"},
+    {ORTH: "'s"},
+    {ORTH: "'t"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
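Each entry above becomes a tokenizer special case: the exact string is kept as a single token, and NORM is applied where given. A quick check against the blank pipeline (the expected values are hedged from the tables above, not verified output):

import spacy

nlp = spacy.blank("ca")
# "St." matches the exception table, stays one token, and carries NORM
# "sant"; clitic forms like "'l" are likewise protected as single tokens.
doc = nlp("St. Jordi")
print([(t.text, t.norm_) for t in doc])
# expected: [('St.', 'sant'), ('Jordi', 'jordi')]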
spacy/tests/lang/ca/test_text.py
@@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer):
     una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida."""
 
     tokens = ca_tokenizer(text)
-    assert len(tokens) == 138
+    assert len(tokens) == 140
 
 
 @pytest.mark.parametrize(
     "text,length",
     [
-        ("Perquè va anar-hi?", 6),
+        ("Perquè va anar-hi?", 4),
         ("“Ah no?”", 5),
         ("""Sí! "Anem", va contestar el Joan Carles""", 11),
         ("Van córrer aprox. 10km", 5),
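The drop from 6 to 4 tokens reflects the UD Catalan AnCora v2.7 convention of merged multi-word tokens: the verb+clitic "anar-hi" is no longer split at the hyphen. A quick check mirroring the updated test (expected output taken from the assertion above):

import spacy

nlp = spacy.blank("ca")
# The updated punctuation rules have no letter-hyphen infix, so the
# clitic cluster stays together.
print([t.text for t in nlp("Perquè va anar-hi?")])
# expected: ['Perquè', 'va', 'anar-hi', '?']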
spacy/tests/lang/test_lemmatizers.py
@@ -8,7 +8,7 @@ from spacy.util import get_lang_class
 # Only include languages with no external dependencies
 # excluded: ru, uk
 # excluded for custom tables: es, pl
-LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
+LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on