diff --git a/spacy/lang/eu/__init__.py b/spacy/lang/eu/__init__.py
new file mode 100644
index 000000000..4f3338c1d
--- /dev/null
+++ b/spacy/lang/eu/__init__.py
@@ -0,0 +1,34 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+from .tag_map import TAG_MAP
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...language import Language
+from ...attrs import LANG
+
+
+class BasqueDefaults(Language.Defaults):
+    """Default settings for Basque (lexical attributes, tag map, stop words)."""
+
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: "eu"
+
+    tokenizer_exceptions = BASE_EXCEPTIONS
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
+
+
+class Basque(Language):
+    """Language class for Basque, registered under the code "eu"."""
+
+    lang = "eu"
+    Defaults = BasqueDefaults
+
+
+__all__ = ["Basque"]
diff --git a/spacy/lang/eu/examples.py b/spacy/lang/eu/examples.py
new file mode 100644
index 000000000..f2d325d78
--- /dev/null
+++ b/spacy/lang/eu/examples.py
@@ -0,0 +1,14 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.eu.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
+    "gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
+]
diff --git a/spacy/lang/eu/lex_attrs.py b/spacy/lang/eu/lex_attrs.py
new file mode 100644
index 000000000..c11e913db
--- /dev/null
+++ b/spacy/lang/eu/lex_attrs.py
@@ -0,0 +1,87 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+# Source http://mylanguages.org/basque_numbers.php
+
+
+_num_words = """
+bat
+bi
+hiru
+lau
+bost
+sei
+zazpi
+zortzi
+bederatzi
+hamar
+hamaika
+hamabi
+hamahiru
+hamalau
+hamabost
+hamasei
+hamazazpi
+hemezortzi
+hemeretzi
+hogei
+ehun
+mila
+milioi
+""".split()
+
+# source https://www.google.com/intl/ur/inputtools/try/
+
+_ordinal_words = """
+lehen
+bigarren
+hirugarren
+laugarren
+bosgarren
+seigarren
+zazpigarren
+zortzigarren
+bederatzigarren
+hamargarren
+hamaikagarren
+hamabigarren
+hamahirugarren
+hamalaugarren
+hamabosgarren
+hamaseigarren
+hamazazpigarren
+hamazortzigarren
+hemeretzigarren
+hogeigarren
+behin
+""".split()
+
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    Accepts digit strings (optionally signed, with "," or "." separators),
+    simple fractions such as "1/2", and the Basque number words listed in
+    `_num_words` and `_ordinal_words`, matched case-insensitively.
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    # The word lists are lowercase, so capitalised tokens must be folded first.
+    text_lower = text.lower()
+    if text_lower in _num_words:
+        return True
+    if text_lower in _ordinal_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/eu/punctuation.py b/spacy/lang/eu/punctuation.py
new file mode 100644
index 000000000..b8b1a1c83
--- /dev/null
+++ b/spacy/lang/eu/punctuation.py
@@ -0,0 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_suffixes = TOKENIZER_SUFFIXES
diff --git a/spacy/lang/eu/stop_words.py b/spacy/lang/eu/stop_words.py
new file mode 100644
index 000000000..208238961
--- /dev/null
+++ b/spacy/lang/eu/stop_words.py
@@ -0,0 +1,108 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# Source: https://github.com/stopwords-iso/stopwords-eu
+# https://www.ranks.nl/stopwords/basque
+# https://www.mustgo.com/worldlanguages/basque/
+STOP_WORDS = set(
+"""
+al
+anitz
+arabera
+asko
+baina
+bat
+batean
+batek
+bati
+batzuei
+batzuek
+batzuetan
+batzuk
+bera
+beraiek
+berau
+berauek
+bere
+berori
+beroriek
+beste
+bezala
+da
+dago
+dira
+ditu
+du
+dute
+edo
+egin
+ere
+eta
+eurak
+ez
+gainera
+gu
+gutxi
+guzti
+haiei
+haiek
+haietan
+hainbeste
+hala
+han
+handik
+hango
+hara
+hari
+hark
+hartan
+hau
+hauei
+hauek
+hauetan
+hemen
+hemendik
+hemengo
+hi
+hona
+honek
+honela
+honetan
+honi
+hor
+hori
+horiei
+horiek
+horietan
+horko
+horra
+horrek
+horrela
+horretan
+horri
+hortik
+hura
+izan
+ni
+noiz
+nola
+non
+nondik
+nongo
+nor
+nora
+ze
+zein
+zen
+zenbait
+zenbat
+zer
+zergatik
+ziren
+zituen
+zu
+zuek
+zuen
+zuten
+""".split()
+)
diff --git a/spacy/lang/eu/tag_map.py b/spacy/lang/eu/tag_map.py
new file mode 100644
index 000000000..2499d7e3e
--- /dev/null
+++ b/spacy/lang/eu/tag_map.py
@@ -0,0 +1,74 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+
+# Maps fine-grained tag strings to a coarse-grained POS symbol plus optional
+# feature name/value pairs.
+# NOTE(review): keys look like English Penn Treebank tags - confirm for Basque.
+TAG_MAP = {
+    ".": {POS: PUNCT, "PunctType": "peri"},
+    ",": {POS: PUNCT, "PunctType": "comm"},
+    "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
+    "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
+    "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
+    '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
+    ":": {POS: PUNCT},
+    "$": {POS: SYM, "Other": {"SymType": "currency"}},
+    "#": {POS: SYM, "Other": {"SymType": "numbersign"}},
+    "AFX": {POS: ADJ, "Hyph": "yes"},
+    "CC": {POS: CCONJ, "ConjType": "coor"},
+    "CD": {POS: NUM, "NumType": "card"},
+    "DT": {POS: DET},
+    "EX": {POS: ADV, "AdvType": "ex"},
+    "FW": {POS: X, "Foreign": "yes"},
+    "HYPH": {POS: PUNCT, "PunctType": "dash"},
+    "IN": {POS: ADP},
+    "JJ": {POS: ADJ, "Degree": "pos"},
+    "JJR": {POS: ADJ, "Degree": "comp"},
+    "JJS": {POS: ADJ, "Degree": "sup"},
+    "LS": {POS: PUNCT, "NumType": "ord"},
+    "MD": {POS: VERB, "VerbType": "mod"},
+    "NIL": {POS: ""},
+    "NN": {POS: NOUN, "Number": "sing"},
+    "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
+    "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
+    "NNS": {POS: NOUN, "Number": "plur"},
+    "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
+    "POS": {POS: PART, "Poss": "yes"},
+    "PRP": {POS: PRON, "PronType": "prs"},
+    "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
+    "RB": {POS: ADV, "Degree": "pos"},
+    "RBR": {POS: ADV, "Degree": "comp"},
+    "RBS": {POS: ADV, "Degree": "sup"},
+    "RP": {POS: PART},
+    "SP": {POS: SPACE},
+    "SYM": {POS: SYM},
+    "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
+    "UH": {POS: INTJ},
+    "VB": {POS: VERB, "VerbForm": "inf"},
+    "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
+    "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
+    "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
+    "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
+    "VBZ": {
+        POS: VERB,
+        "VerbForm": "fin",
+        "Tense": "pres",
+        "Number": "sing",
+        "Person": 3,
+    },
+    "WDT": {POS: ADJ, "PronType": "int|rel"},
+    "WP": {POS: NOUN, "PronType": "int|rel"},
+    "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
+    "WRB": {POS: ADV, "PronType": "int|rel"},
+    "ADD": {POS: X},
+    "NFP": {POS: PUNCT},
+    "GW": {POS: X},
+    "XX": {POS: X},
+    "BES": {POS: VERB},
+    "HVS": {POS: VERB},
+    "_SP": {POS: SPACE},
+}
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 816970e61..fc89c2658 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -83,6 +83,11 @@ def es_tokenizer():
     return get_lang_class("es").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def eu_tokenizer():
+    return get_lang_class("eu").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def fi_tokenizer():
     return get_lang_class("fi").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/eu/test_text.py b/spacy/tests/lang/eu/test_text.py
new file mode 100644
index 000000000..e73917ffa
--- /dev/null
+++ b/spacy/tests/lang/eu/test_text.py
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_eu_tokenizer_handles_long_text(eu_tokenizer):
+    text = """ta nere guitarra estrenatu ondoren"""
+    tokens = eu_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
+def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
+    tokens = eu_tokenizer(text)
+    assert len(tokens) == length