Merge pull request #5097 from mirfan899/master

Basque language support added.
2026-02-02 05:26:01 +03:00 · 2020-03-04 17:20:23 +01:00 · 2020-03-04 17:20:23 +01:00 · 31faab3647
commit 31faab3647
parent 99d8ee506f 224a7f8e94
8 changed files with 331 additions and 0 deletions
--- a/spacy/lang/eu/__init__.py
+++ b/spacy/lang/eu/__init__.py
@ -0,0 +1,30 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .tag_map import TAG_MAP
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+
+
+class BasqueDefaults(Language.Defaults):
+    """Default settings for Basque, layered on top of `Language.Defaults`."""
+
+    # Static resources imported at the top of this module.
+    stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
+    tag_map = TAG_MAP
+    tokenizer_exceptions = BASE_EXCEPTIONS
+
+    # Work on a copy of the inherited getters so the base class mapping is
+    # not mutated, then add the Basque-specific attributes and the language
+    # code getter.
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: "eu"
+
+
+# Language subclass for Basque: sets the language code "eu" and plugs in the
+# BasqueDefaults defined above.
+class Basque(Language):
+    lang = "eu"
+    Defaults = BasqueDefaults
+
+
+# Only the Basque class is exported from this module.
+__all__ = ["Basque"]
--- a/spacy/lang/eu/examples.py
+++ b/spacy/lang/eu/examples.py
@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.eu.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# Sample Basque sentences; both are written in lower case without punctuation.
+sentences = [
+    "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
+    "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
+]
--- a/spacy/lang/eu/lex_attrs.py
+++ b/spacy/lang/eu/lex_attrs.py
@ -0,0 +1,80 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+# Source http://mylanguages.org/basque_numbers.php
+
+
+# Cardinal number words. All entries are lower case because like_num()
+# lower-cases the token before looking it up here.
+_num_words = """
+bat
+bi
+hiru
+lau
+bost
+sei
+zazpi
+zortzi
+bederatzi
+hamar
+hamaika
+hamabi
+hamahiru
+hamalau
+hamabost
+hamasei
+hamazazpi
+hemezortzi
+hemeretzi
+hogei
+ehun
+mila
+milioi
+""".split()
+
+# source https://www.google.com/intl/ur/inputtools/try/
+
+# Ordinal words (plus "behin"), also lower case for the same reason.
+_ordinal_words = """
+lehen
+bigarren
+hirugarren
+laugarren
+bosgarren
+seigarren
+zazpigarren
+zortzigarren
+bederatzigarren
+hamargarren
+hamaikagarren
+hamabigarren
+hamahirugarren
+hamalaugarren
+hamabosgarren
+hamaseigarren
+hamazazpigarren
+hamazortzigarren
+hemeretzigarren
+hogeigarren
+behin
+""".split()
+
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    A single leading "+", "-", "±" or "~" is dropped and every "," and "."
+    is removed before checking. The remainder counts as a number when it is
+    all digits, a simple fraction of two digit strings ("1/2"), or one of
+    the Basque words in `_num_words` / `_ordinal_words`. The word lookup is
+    case-insensitive.
+
+    text: the token text to test.
+    RETURNS (bool): whether the text resembles a number.
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    # Lower-case before the word lookup so capitalised tokens (for example
+    # at the start of a sentence) are still recognised.
+    text_lower = text.lower()
+    return text_lower in _num_words or text_lower in _ordinal_words
+
+
+# Lexical attribute getters exported by this module: LIKE_NUM is computed by
+# like_num() above.
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/eu/punctuation.py
+++ b/spacy/lang/eu/punctuation.py
@ -0,0 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+# Alias of the shared suffix rules imported above; nothing in this module
+# reads it. The name TOKENIZER_SUFFIXES is available to importers of this
+# module through that same import.
+_suffixes = TOKENIZER_SUFFIXES
--- a/spacy/lang/eu/stop_words.py
+++ b/spacy/lang/eu/stop_words.py
@ -0,0 +1,108 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# Source: https://github.com/stopwords-iso/stopwords-eu
+# https://www.ranks.nl/stopwords/basque
+# https://www.mustgo.com/worldlanguages/basque/
+# Basque stop words, one per line; the text block is split on whitespace and
+# stored as a set.
+STOP_WORDS = set(
+"""
+al
+anitz
+arabera
+asko
+baina
+bat
+batean
+batek
+bati
+batzuei
+batzuek
+batzuetan
+batzuk
+bera
+beraiek
+berau
+berauek
+bere
+berori
+beroriek
+beste
+bezala
+da
+dago
+dira
+ditu
+du
+dute
+edo
+egin
+ere
+eta
+eurak
+ez
+gainera
+gu
+gutxi
+guzti
+haiei
+haiek
+haietan
+hainbeste
+hala
+han
+handik
+hango
+hara
+hari
+hark
+hartan
+hau
+hauei
+hauek
+hauetan
+hemen
+hemendik
+hemengo
+hi
+hona
+honek
+honela
+honetan
+honi
+hor
+hori
+horiei
+horiek
+horietan
+horko
+horra
+horrek
+horrela
+horretan
+horri
+hortik
+hura
+izan
+ni
+noiz
+nola
+non
+nondik
+nongo
+nor
+nora
+ze
+zein
+zen
+zenbait
+zenbat
+zer
+zergatik
+ziren
+zituen
+zu
+zuek
+zuen
+zuten
+""".split()
+)
--- a/spacy/lang/eu/tag_map.py
+++ b/spacy/lang/eu/tag_map.py
@ -0,0 +1,71 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+
+# Maps each fine-grained tag string to a dict holding its coarse POS symbol
+# and, for most tags, extra morphological feature entries.
+# NOTE(review): the keys look like Penn Treebank-style tags rather than a
+# Basque-specific tagset -- confirm this is the intended tag inventory.
+TAG_MAP = {
+    ".": {POS: PUNCT, "PunctType": "peri"},
+    ",": {POS: PUNCT, "PunctType": "comm"},
+    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
+    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
+    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
+    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    ":": {POS: PUNCT},
+    "$": {POS: SYM, "Other": {"SymType": "currency"}},
+    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+    "AFX": {POS: ADJ, "Hyph": "yes"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
+    "CD": {POS: NUM, "NumType": "card"},
+    "DT": {POS: DET},
+    "EX": {POS: ADV, "AdvType": "ex"},
+    "FW": {POS: X, "Foreign": "yes"},
+    "HYPH": {POS: PUNCT, "PunctType": "dash"},
+    "IN": {POS: ADP},
+    "JJ": {POS: ADJ, "Degree": "pos"},
+    "JJR": {POS: ADJ, "Degree": "comp"},
+    "JJS": {POS: ADJ, "Degree": "sup"},
+    "LS": {POS: PUNCT, "NumType": "ord"},
+    "MD": {POS: VERB, "VerbType": "mod"},
+    "NIL": {POS: ""},
+    "NN": {POS: NOUN, "Number": "sing"},
+    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
+    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
+    "NNS": {POS: NOUN, "Number": "plur"},
+    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
+    "POS": {POS: PART, "Poss": "yes"},
+    "PRP": {POS: PRON, "PronType": "prs"},
+    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
+    "RB": {POS: ADV, "Degree": "pos"},
+    "RBR": {POS: ADV, "Degree": "comp"},
+    "RBS": {POS: ADV, "Degree": "sup"},
+    "RP": {POS: PART},
+    "SP": {POS: SPACE},
+    "SYM": {POS: SYM},
+    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
+    "UH": {POS: INTJ},
+    "VB": {POS: VERB, "VerbForm": "inf"},
+    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
+    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
+    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
+    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
+    "VBZ": {
+        POS: VERB,
+        "VerbForm": "fin",
+        "Tense": "pres",
+        "Number": "sing",
+        "Person": 3,
+    },
+    "WDT": {POS: ADJ, "PronType": "int|rel"},
+    "WP": {POS: NOUN, "PronType": "int|rel"},
+    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
+    "WRB": {POS: ADV, "PronType": "int|rel"},
+    "ADD": {POS: X},
+    "NFP": {POS: PUNCT},
+    "GW": {POS: X},
+    "XX": {POS: X},
+    "BES": {POS: VERB},
+    "HVS": {POS: VERB},
+    "_SP": {POS: SPACE},
+}
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -83,6 +83,11 @@ def es_tokenizer():
    return get_lang_class("es").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def eu_tokenizer():
+    """Session-scoped tokenizer created from the "eu" language defaults."""
+    basque_cls = get_lang_class("eu")
+    return basque_cls.Defaults.create_tokenizer()
+
+
@pytest.fixture(scope="session")
 def fi_tokenizer():
    return get_lang_class("fi").Defaults.create_tokenizer()
--- a/spacy/tests/lang/eu/test_text.py
+++ b/spacy/tests/lang/eu/test_text.py
@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_eu_tokenizer_handles_long_text(eu_tokenizer):
+    """A five-word sample sentence must be split into five tokens."""
+    doc = eu_tokenizer("ta nere guitarra estrenatu ondoren")
+    assert len(doc) == 5
+
+
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        ("milesker ederra joan zen hitzaldia plazer hutsa", 7),
+        ("astelehen guztia sofan pasau biot", 5),
+    ],
+)
+def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
+    """Each sample text must tokenize into the expected number of tokens."""
+    assert len(eu_tokenizer(text)) == length