Basque language added and tested.

2025-09-22 20:16:43 +03:00 · 2020-03-04 11:58:56 +05:00 · 2020-03-04 11:58:56 +05:00 · 03376c9d9b
commit 03376c9d9b
parent 9be90dbca3
8 changed files with 333 additions and 0 deletions
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@ -0,0 +1,30 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
 from .tag_map import TAG_MAP
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
 class BasqueDefaults(Language.Defaults):
    """Language defaults for Basque ("eu").

    Combines the shared base tokenizer exceptions with the Basque stop words,
    tag map, tokenizer suffixes and lexical attribute getters.
    """

    # Static language data.
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    suffixes = TOKENIZER_SUFFIXES
    tokenizer_exceptions = BASE_EXCEPTIONS

    # Work on a copy so the updates below never mutate Language.Defaults.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    # Every lexeme created with these defaults reports the language code "eu".
    lex_attr_getters[LANG] = lambda text: "eu"
 class Basque(Language):
    """Language subclass for Basque (language code "eu") using BasqueDefaults."""

    Defaults = BasqueDefaults
    lang = "eu"


 # Only the language class is part of this module's public API.
 __all__ = ["Basque"]
--- a/spacy/lang/eu/examples.py
+++ b/spacy/lang/eu/examples.py
@ -0,0 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.eu.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # Informal Basque sentences. The previous placeholders were empty strings,
 # which produce empty docs; these are the same strings exercised by the
 # tokenizer tests in spacy/tests/lang/eu/test_text.py.
 sentences = [
    "ta nere guitarra estrenatu ondoren",
    "milesker ederra joan zen hitzaldia plazer hutsa",
    "astelehen guztia sofan pasau biot",
 ]
--- a/spacy/lang/eu/lex_attrs.py
+++ b/spacy/lang/eu/lex_attrs.py
@ -0,0 +1,80 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...attrs import LIKE_NUM
 # Source http://mylanguages.org/basque_numbers.php
 # Cardinal number words. All entries are lower-case because like_num() folds
 # case before looking a token up.
 _num_words = """
 bat
 bi
 hiru
 lau
 bost
 sei
 zazpi
 zortzi
 bederatzi
 hamar
 hamaika
 hamabi
 hamahiru
 hamalau
 hamabost
 hamasei
 hamazazpi
 hemezortzi
 hemeretzi
 hogei
 ehun
 mila
 milioi
 """.split()
 # source https://www.google.com/intl/ur/inputtools/try/
 _ordinal_words = """
 lehen
 bigarren
 hirugarren
 laugarren
 bosgarren
 seigarren
 zazpigarren
 zortzigarren
 bederatzigarren
 hamargarren
 hamaikagarren
 hamabigarren
 hamahirugarren
 hamalaugarren
 hamabosgarren
 hamaseigarren
 hamazazpigarren
 hamazortzigarren
 hemeretzigarren
 hogeigarren
 behin
 """.split()


 def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after dropping one leading sign
    character ("+", "-", "±", "~") and every "," and ".", it is:

    * made up only of digits, or
    * a simple fraction ``<digits>/<digits>``, or
    * one of the Basque cardinal or ordinal words listed in this module
      (matched case-insensitively).

    text (str): The token text to check.
    RETURNS (bool): Whether the text resembles a number.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands / decimal separators do not change whether it is a number.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # The word lists are lower-case; fold case so that capitalised forms
    # (e.g. at the start of a sentence) are recognised as well.
    lowered = text.lower()
    if lowered in _num_words or lowered in _ordinal_words:
        return True
    return False
 # Lexical attribute getters defined by this module: maps the LIKE_NUM
 # attribute ID to the like_num predicate.
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@ -0,0 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ..punctuation import TOKENIZER_SUFFIXES
 # No Basque-specific suffix rules are defined here: the shared
 # TOKENIZER_SUFFIXES imported above are aliased unchanged.
 # NOTE(review): `_suffixes` itself is not referenced anywhere in the visible
 # code -- confirm whether it is still needed.
 _suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/eu/stop_words.py
+++ b/spacy/lang/eu/stop_words.py
@ -0,0 +1,108 @@
 # encoding: utf8
 from __future__ import unicode_literals
 # Source: https://github.com/stopwords-iso/stopwords-eu
 # https://www.ranks.nl/stopwords/basque
 # https://www.mustgo.com/worldlanguages/basque/
 # Raw stop-word listing, one word per line. Whitespace is irrelevant because
 # the text is tokenised with str.split() below.
 _STOP_WORD_TEXT = """
 al
 anitz
 arabera
 asko
 baina
 bat
 batean
 batek
 bati
 batzuei
 batzuek
 batzuetan
 batzuk
 bera
 beraiek
 berau
 berauek
 bere
 berori
 beroriek
 beste
 bezala
 da
 dago
 dira
 ditu
 du
 dute
 edo
 egin
 ere
 eta
 eurak
 ez
 gainera
 gu
 gutxi
 guzti
 haiei
 haiek
 haietan
 hainbeste
 hala
 han
 handik
 hango
 hara
 hari
 hark
 hartan
 hau
 hauei
 hauek
 hauetan
 hemen
 hemendik
 hemengo
 hi
 hona
 honek
 honela
 honetan
 honi
 hor
 hori
 horiei
 horiek
 horietan
 horko
 horra
 horrek
 horrela
 horretan
 horri
 hortik
 hura
 izan
 ni
 noiz
 nola
 non
 nondik
 nongo
 nor
 nora
 ze
 zein
 zen
 zenbait
 zenbat
 zer
 zergatik
 ziren
 zituen
 zu
 zuek
 zuen
 zuten
 """

 # Set of Basque stop words built from the listing above.
 STOP_WORDS = set(_STOP_WORD_TEXT.split())
--- a/spacy/lang/eu/tag_map.py
+++ b/spacy/lang/eu/tag_map.py
@ -0,0 +1,71 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
 # Maps each fine-grained tag string to a dict holding its coarse-grained POS
 # symbol plus optional morphological features.
 # NOTE(review): the keys look like Penn Treebank tags (NN, VBZ, PRP$, ...)
 # rather than a Basque-specific tag set -- confirm this is the intended scheme.
 TAG_MAP = {
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "$": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CCONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    # NOTE(review): POS is an empty string here, unlike every other entry,
    # which uses a symbol -- confirm this is intended.
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
    "SP": {POS: SPACE},
    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    "VBZ": {
        POS: VERB,
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": 3,
    },
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "ADD": {POS: X},
    "NFP": {POS: PUNCT},
    "GW": {POS: X},
    "XX": {POS: X},
    "BES": {POS: VERB},
    "HVS": {POS: VERB},
    "_SP": {POS: SPACE},
 }
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -83,6 +83,11 @@ def es_tokenizer():
    return get_lang_class("es").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def eu_tokenizer():
    """Session-scoped fixture: tokenizer built from the "eu" language defaults."""
    basque_cls = get_lang_class("eu")
    return basque_cls.Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def fi_tokenizer():
    """Session-scoped fixture: tokenizer built from the "fi" language defaults."""
    finnish_cls = get_lang_class("fi")
    return finnish_cls.Defaults.create_tokenizer()
--- a/spacy/tests/lang/eu/test_text.py
+++ b/spacy/tests/lang/eu/test_text.py
@ -0,0 +1,16 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 def test_eu_tokenizer_handles_long_text(eu_tokenizer):
    """The five whitespace-separated words must come out as five tokens."""
    sample = "ta nere guitarra estrenatu ondoren"
    doc = eu_tokenizer(sample)
    assert len(doc) == 5
@pytest.mark.parametrize(
    "text,length",
    [
        ("milesker ederra joan zen hitzaldia plazer hutsa", 7),
        ("astelehen guztia sofan pasau biot", 5),
    ],
)
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
    """Each sample sentence must tokenize into the expected number of tokens."""
    doc = eu_tokenizer(text)
    assert len(doc) == length