Merge pull request #5097 from mirfan899/master
Basque language support added.
Commit: 31faab3647
spacy/lang/eu/__init__.py (new file, 30 lines)
@@ -0,0 +1,30 @@
# coding: utf8
from __future__ import unicode_literals

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES
from .tag_map import TAG_MAP

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG


class BasqueDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "eu"

    tokenizer_exceptions = BASE_EXCEPTIONS
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES


class Basque(Language):
    lang = "eu"
    Defaults = BasqueDefaults


__all__ = ["Basque"]
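Not part of the diff: a minimal usage sketch of the class defined above, assuming spaCy's v2-era API in which a Language subclass is instantiated directly as a blank pipeline. The sample phrase is arbitrary.

# Sketch only: build a blank Basque pipeline from the new language class.
from spacy.lang.eu import Basque

nlp = Basque()                       # uses BasqueDefaults defined above
doc = nlp("hogei etxe daude hemen")  # arbitrary Basque phrase
print([token.text for token in doc])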
spacy/lang/eu/examples.py (new file, 14 lines)
@@ -0,0 +1,14 @@
# coding: utf8
from __future__ import unicode_literals

"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.eu.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
    "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
]
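The >>> lines in the docstring assume an existing nlp object; a self-contained sketch (not part of the diff), reusing the Basque class from __init__.py above:

# Sketch: pipe the bundled example sentences through a blank Basque pipeline.
from spacy.lang.eu import Basque
from spacy.lang.eu.examples import sentences

nlp = Basque()
for doc in nlp.pipe(sentences):
    print(len(doc), doc.text[:40])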
spacy/lang/eu/lex_attrs.py (new file, 80 lines)
@@ -0,0 +1,80 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM

# Source http://mylanguages.org/basque_numbers.php


_num_words = """
bat
bi
hiru
lau
bost
sei
zazpi
zortzi
bederatzi
hamar
hamaika
hamabi
hamahiru
hamalau
hamabost
hamasei
hamazazpi
hemezortzi
hemeretzi
hogei
ehun
mila
milioi
""".split()

# source https://www.google.com/intl/ur/inputtools/try/

_ordinal_words = """
lehen
bigarren
hirugarren
laugarren
bosgarren
seigarren
zazpigarren
zortzigarren
bederatzigarren
hamargarren
hamaikagarren
hamabigarren
hamahirugarren
hamalaugarren
hamabosgarren
hamaseigarren
hamazazpigarren
hamazortzigarren
hemeretzigarren
hogeigarren
behin
""".split()


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    if text in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
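A quick sanity check of like_num (not part of the diff); the expected values follow directly from the branches of the function above:

# Sketch: like_num accepts digits, separated numbers, fractions, and the word lists.
from spacy.lang.eu.lex_attrs import like_num

print(like_num("10"))     # True: plain digits
print(like_num("3,000"))  # True: "," and "." are stripped before isdigit()
print(like_num("3/4"))    # True: both sides of a single "/" are digits
print(like_num("hamar"))  # True: cardinal "ten" in _num_words
print(like_num("lehen"))  # True: ordinal "first" in _ordinal_words
print(like_num("etxe"))   # False: "house" is not number-like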
spacy/lang/eu/punctuation.py (new file, 7 lines)
@@ -0,0 +1,7 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_SUFFIXES


_suffixes = TOKENIZER_SUFFIXES
spacy/lang/eu/stop_words.py (new file, 108 lines)
@@ -0,0 +1,108 @@
# encoding: utf8
from __future__ import unicode_literals

# Source: https://github.com/stopwords-iso/stopwords-eu
# https://www.ranks.nl/stopwords/basque
# https://www.mustgo.com/worldlanguages/basque/
STOP_WORDS = set(
    """
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
""".split()
)
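STOP_WORDS is a plain set of strings, so membership checks need no pipeline; a short sketch (not part of the diff):

# Sketch: direct membership checks against the stop-word set.
from spacy.lang.eu.stop_words import STOP_WORDS

print("eta" in STOP_WORDS)   # True: "and" is listed above
print("etxe" in STOP_WORDS)  # False: content words are excluded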
spacy/lang/eu/tag_map.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON

TAG_MAP = {
    ".": {POS: PUNCT, "PunctType": "peri"},
    ",": {POS: PUNCT, "PunctType": "comm"},
    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
    ":": {POS: PUNCT},
    "$": {POS: SYM, "Other": {"SymType": "currency"}},
    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
    "AFX": {POS: ADJ, "Hyph": "yes"},
    "CC": {POS: CCONJ, "ConjType": "coor"},
    "CD": {POS: NUM, "NumType": "card"},
    "DT": {POS: DET},
    "EX": {POS: ADV, "AdvType": "ex"},
    "FW": {POS: X, "Foreign": "yes"},
    "HYPH": {POS: PUNCT, "PunctType": "dash"},
    "IN": {POS: ADP},
    "JJ": {POS: ADJ, "Degree": "pos"},
    "JJR": {POS: ADJ, "Degree": "comp"},
    "JJS": {POS: ADJ, "Degree": "sup"},
    "LS": {POS: PUNCT, "NumType": "ord"},
    "MD": {POS: VERB, "VerbType": "mod"},
    "NIL": {POS: ""},
    "NN": {POS: NOUN, "Number": "sing"},
    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
    "NNS": {POS: NOUN, "Number": "plur"},
    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
    "POS": {POS: PART, "Poss": "yes"},
    "PRP": {POS: PRON, "PronType": "prs"},
    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
    "RB": {POS: ADV, "Degree": "pos"},
    "RBR": {POS: ADV, "Degree": "comp"},
    "RBS": {POS: ADV, "Degree": "sup"},
    "RP": {POS: PART},
    "SP": {POS: SPACE},
    "SYM": {POS: SYM},
    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
    "UH": {POS: INTJ},
    "VB": {POS: VERB, "VerbForm": "inf"},
    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
    "VBZ": {
        POS: VERB,
        "VerbForm": "fin",
        "Tense": "pres",
        "Number": "sing",
        "Person": 3,
    },
    "WDT": {POS: ADJ, "PronType": "int|rel"},
    "WP": {POS: NOUN, "PronType": "int|rel"},
    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
    "WRB": {POS: ADV, "PronType": "int|rel"},
    "ADD": {POS: X},
    "NFP": {POS: PUNCT},
    "GW": {POS: X},
    "XX": {POS: X},
    "BES": {POS: VERB},
    "HVS": {POS: VERB},
    "_SP": {POS: SPACE},
}
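Each entry pairs the coarse POS symbol with morphological features; a quick lookup sketch (not part of the diff, using the public spacy.symbols module):

# Sketch: TAG_MAP keys are fine-grained tags; POS is the integer symbol key.
from spacy.lang.eu.tag_map import TAG_MAP
from spacy.symbols import POS, NOUN

entry = TAG_MAP["NN"]
print(entry[POS] == NOUN)  # True: fine-grained "NN" maps to coarse NOUN
print(entry["Number"])     # "sing"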
spacy/tests/conftest.py (modified)
@@ -83,6 +83,11 @@ def es_tokenizer():
    return get_lang_class("es").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def eu_tokenizer():
    return get_lang_class("eu").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def fi_tokenizer():
    return get_lang_class("fi").Defaults.create_tokenizer()
spacy/tests/lang/eu/test_text.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


def test_eu_tokenizer_handles_long_text(eu_tokenizer):
    text = """ta nere guitarra estrenatu ondoren"""
    tokens = eu_tokenizer(text)
    assert len(tokens) == 5


@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
    tokens = eu_tokenizer(text)
    assert len(tokens) == length