Merge pull request #5097 from mirfan899/master

Basque language support added.
This commit is contained in:
Ines Montani 2020-03-04 17:20:23 +01:00 committed by GitHub
commit 31faab3647
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 331 additions and 0 deletions

30
spacy/lang/eu/__init__.py Normal file
View File

@ -0,0 +1,30 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tag_map import TAG_MAP
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
class BasqueDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "eu"
tokenizer_exceptions = BASE_EXCEPTIONS
tag_map = TAG_MAP
stop_words = STOP_WORDS
suffixes = TOKENIZER_SUFFIXES
class Basque(Language):
lang = "eu"
Defaults = BasqueDefaults
__all__ = ["Basque"]

14
spacy/lang/eu/examples.py Normal file
View File

@ -0,0 +1,14 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.eu.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
]

View File

@ -0,0 +1,80 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
# Source http://mylanguages.org/basque_numbers.php
_num_words = """
bat
bi
hiru
lau
bost
sei
zazpi
zortzi
bederatzi
hamar
hamaika
hamabi
hamahiru
hamalau
hamabost
hamasei
hamazazpi
Hemezortzi
hemeretzi
hogei
ehun
mila
milioi
""".split()
# source https://www.google.com/intl/ur/inputtools/try/
_ordinal_words = """
lehen
bigarren
hirugarren
laugarren
bosgarren
seigarren
zazpigarren
zortzigarren
bederatzigarren
hamargarren
hamaikagarren
hamabigarren
hamahirugarren
hamalaugarren
hamabosgarren
hamaseigarren
hamazazpigarren
hamazortzigarren
hemeretzigarren
hogeigarren
behin
""".split()
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
if text in _ordinal_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_SUFFIXES
_suffixes = TOKENIZER_SUFFIXES

108
spacy/lang/eu/stop_words.py Normal file
View File

@ -0,0 +1,108 @@
# encoding: utf8
from __future__ import unicode_literals
# Source: https://github.com/stopwords-iso/stopwords-eu
# https://www.ranks.nl/stopwords/basque
# https://www.mustgo.com/worldlanguages/basque/
STOP_WORDS = set(
"""
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
""".split()
)

71
spacy/lang/eu/tag_map.py Normal file
View File

@ -0,0 +1,71 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = {
".": {POS: PUNCT, "PunctType": "peri"},
",": {POS: PUNCT, "PunctType": "comm"},
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
"$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CCONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},
"FW": {POS: X, "Foreign": "yes"},
"HYPH": {POS: PUNCT, "PunctType": "dash"},
"IN": {POS: ADP},
"JJ": {POS: ADJ, "Degree": "pos"},
"JJR": {POS: ADJ, "Degree": "comp"},
"JJS": {POS: ADJ, "Degree": "sup"},
"LS": {POS: PUNCT, "NumType": "ord"},
"MD": {POS: VERB, "VerbType": "mod"},
"NIL": {POS: ""},
"NN": {POS: NOUN, "Number": "sing"},
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
"NNS": {POS: NOUN, "Number": "plur"},
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
"RB": {POS: ADV, "Degree": "pos"},
"RBR": {POS: ADV, "Degree": "comp"},
"RBS": {POS: ADV, "Degree": "sup"},
"RP": {POS: PART},
"SP": {POS: SPACE},
"SYM": {POS: SYM},
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
"UH": {POS: INTJ},
"VB": {POS: VERB, "VerbForm": "inf"},
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
"VBZ": {
POS: VERB,
"VerbForm": "fin",
"Tense": "pres",
"Number": "sing",
"Person": 3,
},
"WDT": {POS: ADJ, "PronType": "int|rel"},
"WP": {POS: NOUN, "PronType": "int|rel"},
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"},
"ADD": {POS: X},
"NFP": {POS: PUNCT},
"GW": {POS: X},
"XX": {POS: X},
"BES": {POS: VERB},
"HVS": {POS: VERB},
"_SP": {POS: SPACE},
}

View File

@ -83,6 +83,11 @@ def es_tokenizer():
return get_lang_class("es").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def eu_tokenizer():
return get_lang_class("eu").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def fi_tokenizer():
return get_lang_class("fi").Defaults.create_tokenizer()

View File

@ -0,0 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
def test_eu_tokenizer_handles_long_text(eu_tokenizer):
text = """ta nere guitarra estrenatu ondoren"""
tokens = eu_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
tokens = eu_tokenizer(text)
assert len(tokens) == length