From 6dbf4f7ad7a7e844cfbd7e3e8ff4b8e21bf0f1d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 2 Nov 2016 20:02:41 +0100 Subject: [PATCH] Stub out support for French, Spanish, Italian and Portuguese --- spacy/es/__init__.py | 26 +++ spacy/es/language_data.py | 353 ++++++++++++++++++++++++++++++++++++++ spacy/fr/__init__.py | 27 +++ spacy/fr/language_data.py | 353 ++++++++++++++++++++++++++++++++++++++ spacy/pt/__init__.py | 27 +++ spacy/pt/language_data.py | 353 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 1139 insertions(+) create mode 100644 spacy/es/__init__.py create mode 100644 spacy/es/language_data.py create mode 100644 spacy/fr/__init__.py create mode 100644 spacy/fr/language_data.py create mode 100644 spacy/pt/__init__.py create mode 100644 spacy/pt/language_data.py diff --git a/spacy/es/__init__.py b/spacy/es/__init__.py new file mode 100644 index 000000000..7655e7985 --- /dev/null +++ b/spacy/es/__init__.py @@ -0,0 +1,26 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language +from ..attrs import LANG +from . import language_data + + +class Spanish(Language): + lang = 'es' + + class Defaults(Language.Defaults): + tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'es' + + prefixes = tuple(language_data.TOKENIZER_PREFIXES) + + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) + + infixes = tuple(language_data.TOKENIZER_INFIXES) + + tag_map = dict(language_data.TAG_MAP) + + stop_words = set(language_data.STOP_WORDS) diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py new file mode 100644 index 000000000..291492957 --- /dev/null +++ b/spacy/es/language_data.py @@ -0,0 +1,353 @@ +# encoding: utf8 +from __future__ import unicode_literals +import re + + +STOP_WORDS = set() + + +TOKENIZER_PREFIXES = map(re.escape, r''' +, +" +( +[ +{ +* +< +> +$ +£ +„ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... +‚ +» +_ +§ +'''.strip().split('\n')) + + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +“ +« +_ +'' +'s +'S +’s +’S +’ +‘ +° +€ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + + +TOKENIZER_INFIXES = tuple() + + +TOKENIZER_EXCEPTIONS = { + "vs.": [{"F": "vs."}], + + "''": [{"F": "''"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], + + "a.m.": [{"F": "a.m."}], + "p.m.": [{"F": "p.m."}], + + "1a.m.": [{"F": "1"}, {"F": "a.m."}], + "2a.m.": [{"F": "2"}, {"F": "a.m."}], + "3a.m.": [{"F": "3"}, {"F": "a.m."}], + "4a.m.": [{"F": "4"}, {"F": "a.m."}], + "5a.m.": [{"F": "5"}, {"F": "a.m."}], + "6a.m.": [{"F": "6"}, {"F": "a.m."}], + "7a.m.": [{"F": "7"}, {"F": "a.m."}], + "8a.m.": [{"F": "8"}, {"F": "a.m."}], + "9a.m.": [{"F": "9"}, {"F": "a.m."}], + "10a.m.": [{"F": "10"}, {"F": "a.m."}], + "11a.m.": [{"F": "11"}, {"F": "a.m."}], + "12a.m.": [{"F": "12"}, {"F": "a.m."}], + "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], + "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], + "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], + "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], + "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], + "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], + "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], + "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], + "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], + "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], + "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], + "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + "p.m.": [{"F": "p.m."}], + "1p.m.": [{"F": "1"}, {"F": "p.m."}], + "2p.m.": [{"F": "2"}, {"F": "p.m."}], + "3p.m.": [{"F": "3"}, {"F": "p.m."}], + "4p.m.": [{"F": "4"}, {"F": "p.m."}], + "5p.m.": [{"F": "5"}, {"F": "p.m."}], + "6p.m.": [{"F": "6"}, {"F": "p.m."}], + "7p.m.": [{"F": "7"}, {"F": "p.m."}], + "8p.m.": [{"F": "8"}, {"F": "p.m."}], + "9p.m.": [{"F": "9"}, {"F": "p.m."}], + "10p.m.": [{"F": "10"}, {"F": "p.m."}], + "11p.m.": [{"F": "11"}, {"F": "p.m."}], + "12p.m.": [{"F": "12"}, {"F": "p.m."}], + "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], + "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], + "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], + "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], + "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], + "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], + "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], + "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], + "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], + "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], + "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], + "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + + "Ala.": [{"F": "Ala."}], + "Ariz.": [{"F": "Ariz."}], + "Ark.": [{"F": "Ark."}], + "Calif.": [{"F": "Calif."}], + "Colo.": [{"F": "Colo."}], + "Conn.": [{"F": "Conn."}], + "Del.": [{"F": "Del."}], + "D.C.": [{"F": "D.C."}], + "Fla.": [{"F": "Fla."}], + "Ga.": [{"F": "Ga."}], + "Ill.": [{"F": "Ill."}], + "Ind.": [{"F": "Ind."}], + "Kans.": [{"F": "Kans."}], + "Kan.": [{"F": "Kan."}], + "Ky.": [{"F": "Ky."}], + "La.": [{"F": "La."}], + "Md.": [{"F": "Md."}], + "Mass.": [{"F": "Mass."}], + "Mich.": [{"F": "Mich."}], + "Minn.": [{"F": "Minn."}], + "Miss.": [{"F": "Miss."}], + "Mo.": [{"F": "Mo."}], + "Mont.": [{"F": "Mont."}], + "Nebr.": [{"F": "Nebr."}], + "Neb.": [{"F": "Neb."}], + "Nev.": [{"F": "Nev."}], + "N.H.": [{"F": "N.H."}], + "N.J.": [{"F": "N.J."}], + "N.M.": [{"F": "N.M."}], + "N.Y.": [{"F": "N.Y."}], + "N.C.": [{"F": "N.C."}], + "N.D.": [{"F": "N.D."}], + "Okla.": [{"F": "Okla."}], + "Ore.": [{"F": "Ore."}], + "Pa.": [{"F": "Pa."}], + "Tenn.": [{"F": "Tenn."}], + "Va.": [{"F": "Va."}], + "Wash.": [{"F": "Wash."}], + "Wis.": [{"F": "Wis."}], + + ":)": [{"F": ":)"}], + "<3": [{"F": "<3"}], + ";)": [{"F": ";)"}], + "(:": [{"F": "(:"}], + ":(": [{"F": ":("}], + "-_-": [{"F": "-_-"}], + "=)": [{"F": "=)"}], + ":/": [{"F": ":/"}], + ":>": [{"F": ":>"}], + ";-)": [{"F": ";-)"}], + ":Y": [{"F": ":Y"}], + ":P": [{"F": ":P"}], + ":-P": [{"F": ":-P"}], + ":3": [{"F": ":3"}], + "=3": [{"F": "=3"}], + "xD": [{"F": "xD"}], + "^_^": [{"F": "^_^"}], + "=]": [{"F": "=]"}], + "=D": [{"F": "=D"}], + "<333": [{"F": "<333"}], + ":))": [{"F": ":))"}], + ":0": [{"F": ":0"}], + "-__-": [{"F": "-__-"}], + "xDD": [{"F": "xDD"}], + "o_o": [{"F": "o_o"}], + "o_O": [{"F": "o_O"}], + "V_V": [{"F": "V_V"}], + "=[[": [{"F": "=[["}], + "<33": [{"F": "<33"}], + ";p": [{"F": ";p"}], + ";D": [{"F": ";D"}], + ";-p": [{"F": ";-p"}], + ";(": [{"F": ";("}], + ":p": [{"F": ":p"}], + ":]": [{"F": ":]"}], + ":O": [{"F": ":O"}], + ":-/": [{"F": ":-/"}], + ":-)": [{"F": ":-)"}], + ":(((": [{"F": ":((("}], + ":((": [{"F": ":(("}], + ":')": [{"F": ":')"}], + "(^_^)": [{"F": "(^_^)"}], + "(=": [{"F": "(="}], + "o.O": [{"F": "o.O"}], + "\")": [{"F": "\")"}], + + "a.": [{"F": "a."}], + "b.": [{"F": "b."}], + "c.": [{"F": "c."}], + "d.": [{"F": "d."}], + "e.": [{"F": "e."}], + "f.": [{"F": "f."}], + "g.": [{"F": "g."}], + "h.": [{"F": "h."}], + "i.": [{"F": "i."}], + "j.": [{"F": "j."}], + "k.": [{"F": "k."}], + "l.": [{"F": "l."}], + "m.": [{"F": "m."}], + "n.": [{"F": "n."}], + "o.": [{"F": "o."}], + "p.": [{"F": "p."}], + "q.": [{"F": "q."}], + "r.": [{"F": "r."}], + "s.": [{"F": "s."}], + "t.": [{"F": "t."}], + "u.": [{"F": "u."}], + "v.": [{"F": "v."}], + "w.": [{"F": "w."}], + "x.": [{"F": "x."}], + "y.": [{"F": "y."}], + "z.": [{"F": "z."}], +} + + +TAG_MAP = { +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NNE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PROAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, +"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"}, +"SP": {"pos": "SPACE"} +} diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py new file mode 100644 index 000000000..4d561910f --- /dev/null +++ b/spacy/fr/__init__.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language +from ..attrs import LANG +from . import language_data + + +class French(Language): + lang = 'fr' + + class Defaults(Language.Defaults): + tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'fr' + + prefixes = tuple(language_data.TOKENIZER_PREFIXES) + + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) + + infixes = tuple(language_data.TOKENIZER_INFIXES) + + tag_map = dict(language_data.TAG_MAP) + + stop_words = set(language_data.STOP_WORDS) + diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py new file mode 100644 index 000000000..291492957 --- /dev/null +++ b/spacy/fr/language_data.py @@ -0,0 +1,353 @@ +# encoding: utf8 +from __future__ import unicode_literals +import re + + +STOP_WORDS = set() + + +TOKENIZER_PREFIXES = map(re.escape, r''' +, +" +( +[ +{ +* +< +> +$ +£ +„ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... +‚ +» +_ +§ +'''.strip().split('\n')) + + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +“ +« +_ +'' +'s +'S +’s +’S +’ +‘ +° +€ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + + +TOKENIZER_INFIXES = tuple() + + +TOKENIZER_EXCEPTIONS = { + "vs.": [{"F": "vs."}], + + "''": [{"F": "''"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], + + "a.m.": [{"F": "a.m."}], + "p.m.": [{"F": "p.m."}], + + "1a.m.": [{"F": "1"}, {"F": "a.m."}], + "2a.m.": [{"F": "2"}, {"F": "a.m."}], + "3a.m.": [{"F": "3"}, {"F": "a.m."}], + "4a.m.": [{"F": "4"}, {"F": "a.m."}], + "5a.m.": [{"F": "5"}, {"F": "a.m."}], + "6a.m.": [{"F": "6"}, {"F": "a.m."}], + "7a.m.": [{"F": "7"}, {"F": "a.m."}], + "8a.m.": [{"F": "8"}, {"F": "a.m."}], + "9a.m.": [{"F": "9"}, {"F": "a.m."}], + "10a.m.": [{"F": "10"}, {"F": "a.m."}], + "11a.m.": [{"F": "11"}, {"F": "a.m."}], + "12a.m.": [{"F": "12"}, {"F": "a.m."}], + "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], + "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], + "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], + "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], + "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], + "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], + "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], + "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], + "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], + "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], + "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], + "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + "p.m.": [{"F": "p.m."}], + "1p.m.": [{"F": "1"}, {"F": "p.m."}], + "2p.m.": [{"F": "2"}, {"F": "p.m."}], + "3p.m.": [{"F": "3"}, {"F": "p.m."}], + "4p.m.": [{"F": "4"}, {"F": "p.m."}], + "5p.m.": [{"F": "5"}, {"F": "p.m."}], + "6p.m.": [{"F": "6"}, {"F": "p.m."}], + "7p.m.": [{"F": "7"}, {"F": "p.m."}], + "8p.m.": [{"F": "8"}, {"F": "p.m."}], + "9p.m.": [{"F": "9"}, {"F": "p.m."}], + "10p.m.": [{"F": "10"}, {"F": "p.m."}], + "11p.m.": [{"F": "11"}, {"F": "p.m."}], + "12p.m.": [{"F": "12"}, {"F": "p.m."}], + "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], + "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], + "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], + "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], + "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], + "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], + "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], + "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], + "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], + "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], + "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], + "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + + "Ala.": [{"F": "Ala."}], + "Ariz.": [{"F": "Ariz."}], + "Ark.": [{"F": "Ark."}], + "Calif.": [{"F": "Calif."}], + "Colo.": [{"F": "Colo."}], + "Conn.": [{"F": "Conn."}], + "Del.": [{"F": "Del."}], + "D.C.": [{"F": "D.C."}], + "Fla.": [{"F": "Fla."}], + "Ga.": [{"F": "Ga."}], + "Ill.": [{"F": "Ill."}], + "Ind.": [{"F": "Ind."}], + "Kans.": [{"F": "Kans."}], + "Kan.": [{"F": "Kan."}], + "Ky.": [{"F": "Ky."}], + "La.": [{"F": "La."}], + "Md.": [{"F": "Md."}], + "Mass.": [{"F": "Mass."}], + "Mich.": [{"F": "Mich."}], + "Minn.": [{"F": "Minn."}], + "Miss.": [{"F": "Miss."}], + "Mo.": [{"F": "Mo."}], + "Mont.": [{"F": "Mont."}], + "Nebr.": [{"F": "Nebr."}], + "Neb.": [{"F": "Neb."}], + "Nev.": [{"F": "Nev."}], + "N.H.": [{"F": "N.H."}], + "N.J.": [{"F": "N.J."}], + "N.M.": [{"F": "N.M."}], + "N.Y.": [{"F": "N.Y."}], + "N.C.": [{"F": "N.C."}], + "N.D.": [{"F": "N.D."}], + "Okla.": [{"F": "Okla."}], + "Ore.": [{"F": "Ore."}], + "Pa.": [{"F": "Pa."}], + "Tenn.": [{"F": "Tenn."}], + "Va.": [{"F": "Va."}], + "Wash.": [{"F": "Wash."}], + "Wis.": [{"F": "Wis."}], + + ":)": [{"F": ":)"}], + "<3": [{"F": "<3"}], + ";)": [{"F": ";)"}], + "(:": [{"F": "(:"}], + ":(": [{"F": ":("}], + "-_-": [{"F": "-_-"}], + "=)": [{"F": "=)"}], + ":/": [{"F": ":/"}], + ":>": [{"F": ":>"}], + ";-)": [{"F": ";-)"}], + ":Y": [{"F": ":Y"}], + ":P": [{"F": ":P"}], + ":-P": [{"F": ":-P"}], + ":3": [{"F": ":3"}], + "=3": [{"F": "=3"}], + "xD": [{"F": "xD"}], + "^_^": [{"F": "^_^"}], + "=]": [{"F": "=]"}], + "=D": [{"F": "=D"}], + "<333": [{"F": "<333"}], + ":))": [{"F": ":))"}], + ":0": [{"F": ":0"}], + "-__-": [{"F": "-__-"}], + "xDD": [{"F": "xDD"}], + "o_o": [{"F": "o_o"}], + "o_O": [{"F": "o_O"}], + "V_V": [{"F": "V_V"}], + "=[[": [{"F": "=[["}], + "<33": [{"F": "<33"}], + ";p": [{"F": ";p"}], + ";D": [{"F": ";D"}], + ";-p": [{"F": ";-p"}], + ";(": [{"F": ";("}], + ":p": [{"F": ":p"}], + ":]": [{"F": ":]"}], + ":O": [{"F": ":O"}], + ":-/": [{"F": ":-/"}], + ":-)": [{"F": ":-)"}], + ":(((": [{"F": ":((("}], + ":((": [{"F": ":(("}], + ":')": [{"F": ":')"}], + "(^_^)": [{"F": "(^_^)"}], + "(=": [{"F": "(="}], + "o.O": [{"F": "o.O"}], + "\")": [{"F": "\")"}], + + "a.": [{"F": "a."}], + "b.": [{"F": "b."}], + "c.": [{"F": "c."}], + "d.": [{"F": "d."}], + "e.": [{"F": "e."}], + "f.": [{"F": "f."}], + "g.": [{"F": "g."}], + "h.": [{"F": "h."}], + "i.": [{"F": "i."}], + "j.": [{"F": "j."}], + "k.": [{"F": "k."}], + "l.": [{"F": "l."}], + "m.": [{"F": "m."}], + "n.": [{"F": "n."}], + "o.": [{"F": "o."}], + "p.": [{"F": "p."}], + "q.": [{"F": "q."}], + "r.": [{"F": "r."}], + "s.": [{"F": "s."}], + "t.": [{"F": "t."}], + "u.": [{"F": "u."}], + "v.": [{"F": "v."}], + "w.": [{"F": "w."}], + "x.": [{"F": "x."}], + "y.": [{"F": "y."}], + "z.": [{"F": "z."}], +} + + +TAG_MAP = { +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NNE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PROAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, +"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"}, +"SP": {"pos": "SPACE"} +} diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py new file mode 100644 index 000000000..a991ef7ae --- /dev/null +++ b/spacy/pt/__init__.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language +from ..attrs import LANG +from . import language_data + + +class Portuguese(Language): + lang = 'pt' + + class Defaults(Language.Defaults): + tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'pt' + + prefixes = tuple(language_data.TOKENIZER_PREFIXES) + + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) + + infixes = tuple(language_data.TOKENIZER_INFIXES) + + tag_map = dict(language_data.TAG_MAP) + + stop_words = set(language_data.STOP_WORDS) + diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py new file mode 100644 index 000000000..291492957 --- /dev/null +++ b/spacy/pt/language_data.py @@ -0,0 +1,353 @@ +# encoding: utf8 +from __future__ import unicode_literals +import re + + +STOP_WORDS = set() + + +TOKENIZER_PREFIXES = map(re.escape, r''' +, +" +( +[ +{ +* +< +> +$ +£ +„ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... +‚ +» +_ +§ +'''.strip().split('\n')) + + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +“ +« +_ +'' +'s +'S +’s +’S +’ +‘ +° +€ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + + +TOKENIZER_INFIXES = tuple() + + +TOKENIZER_EXCEPTIONS = { + "vs.": [{"F": "vs."}], + + "''": [{"F": "''"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], + + "a.m.": [{"F": "a.m."}], + "p.m.": [{"F": "p.m."}], + + "1a.m.": [{"F": "1"}, {"F": "a.m."}], + "2a.m.": [{"F": "2"}, {"F": "a.m."}], + "3a.m.": [{"F": "3"}, {"F": "a.m."}], + "4a.m.": [{"F": "4"}, {"F": "a.m."}], + "5a.m.": [{"F": "5"}, {"F": "a.m."}], + "6a.m.": [{"F": "6"}, {"F": "a.m."}], + "7a.m.": [{"F": "7"}, {"F": "a.m."}], + "8a.m.": [{"F": "8"}, {"F": "a.m."}], + "9a.m.": [{"F": "9"}, {"F": "a.m."}], + "10a.m.": [{"F": "10"}, {"F": "a.m."}], + "11a.m.": [{"F": "11"}, {"F": "a.m."}], + "12a.m.": [{"F": "12"}, {"F": "a.m."}], + "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], + "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], + "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], + "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], + "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], + "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], + "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], + "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], + "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], + "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], + "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], + "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + "p.m.": [{"F": "p.m."}], + "1p.m.": [{"F": "1"}, {"F": "p.m."}], + "2p.m.": [{"F": "2"}, {"F": "p.m."}], + "3p.m.": [{"F": "3"}, {"F": "p.m."}], + "4p.m.": [{"F": "4"}, {"F": "p.m."}], + "5p.m.": [{"F": "5"}, {"F": "p.m."}], + "6p.m.": [{"F": "6"}, {"F": "p.m."}], + "7p.m.": [{"F": "7"}, {"F": "p.m."}], + "8p.m.": [{"F": "8"}, {"F": "p.m."}], + "9p.m.": [{"F": "9"}, {"F": "p.m."}], + "10p.m.": [{"F": "10"}, {"F": "p.m."}], + "11p.m.": [{"F": "11"}, {"F": "p.m."}], + "12p.m.": [{"F": "12"}, {"F": "p.m."}], + "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], + "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], + "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], + "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], + "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], + "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], + "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], + "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], + "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], + "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], + "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], + "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + + "Ala.": [{"F": "Ala."}], + "Ariz.": [{"F": "Ariz."}], + "Ark.": [{"F": "Ark."}], + "Calif.": [{"F": "Calif."}], + "Colo.": [{"F": "Colo."}], + "Conn.": [{"F": "Conn."}], + "Del.": [{"F": "Del."}], + "D.C.": [{"F": "D.C."}], + "Fla.": [{"F": "Fla."}], + "Ga.": [{"F": "Ga."}], + "Ill.": [{"F": "Ill."}], + "Ind.": [{"F": "Ind."}], + "Kans.": [{"F": "Kans."}], + "Kan.": [{"F": "Kan."}], + "Ky.": [{"F": "Ky."}], + "La.": [{"F": "La."}], + "Md.": [{"F": "Md."}], + "Mass.": [{"F": "Mass."}], + "Mich.": [{"F": "Mich."}], + "Minn.": [{"F": "Minn."}], + "Miss.": [{"F": "Miss."}], + "Mo.": [{"F": "Mo."}], + "Mont.": [{"F": "Mont."}], + "Nebr.": [{"F": "Nebr."}], + "Neb.": [{"F": "Neb."}], + "Nev.": [{"F": "Nev."}], + "N.H.": [{"F": "N.H."}], + "N.J.": [{"F": "N.J."}], + "N.M.": [{"F": "N.M."}], + "N.Y.": [{"F": "N.Y."}], + "N.C.": [{"F": "N.C."}], + "N.D.": [{"F": "N.D."}], + "Okla.": [{"F": "Okla."}], + "Ore.": [{"F": "Ore."}], + "Pa.": [{"F": "Pa."}], + "Tenn.": [{"F": "Tenn."}], + "Va.": [{"F": "Va."}], + "Wash.": [{"F": "Wash."}], + "Wis.": [{"F": "Wis."}], + + ":)": [{"F": ":)"}], + "<3": [{"F": "<3"}], + ";)": [{"F": ";)"}], + "(:": [{"F": "(:"}], + ":(": [{"F": ":("}], + "-_-": [{"F": "-_-"}], + "=)": [{"F": "=)"}], + ":/": [{"F": ":/"}], + ":>": [{"F": ":>"}], + ";-)": [{"F": ";-)"}], + ":Y": [{"F": ":Y"}], + ":P": [{"F": ":P"}], + ":-P": [{"F": ":-P"}], + ":3": [{"F": ":3"}], + "=3": [{"F": "=3"}], + "xD": [{"F": "xD"}], + "^_^": [{"F": "^_^"}], + "=]": [{"F": "=]"}], + "=D": [{"F": "=D"}], + "<333": [{"F": "<333"}], + ":))": [{"F": ":))"}], + ":0": [{"F": ":0"}], + "-__-": [{"F": "-__-"}], + "xDD": [{"F": "xDD"}], + "o_o": [{"F": "o_o"}], + "o_O": [{"F": "o_O"}], + "V_V": [{"F": "V_V"}], + "=[[": [{"F": "=[["}], + "<33": [{"F": "<33"}], + ";p": [{"F": ";p"}], + ";D": [{"F": ";D"}], + ";-p": [{"F": ";-p"}], + ";(": [{"F": ";("}], + ":p": [{"F": ":p"}], + ":]": [{"F": ":]"}], + ":O": [{"F": ":O"}], + ":-/": [{"F": ":-/"}], + ":-)": [{"F": ":-)"}], + ":(((": [{"F": ":((("}], + ":((": [{"F": ":(("}], + ":')": [{"F": ":')"}], + "(^_^)": [{"F": "(^_^)"}], + "(=": [{"F": "(="}], + "o.O": [{"F": "o.O"}], + "\")": [{"F": "\")"}], + + "a.": [{"F": "a."}], + "b.": [{"F": "b."}], + "c.": [{"F": "c."}], + "d.": [{"F": "d."}], + "e.": [{"F": "e."}], + "f.": [{"F": "f."}], + "g.": [{"F": "g."}], + "h.": [{"F": "h."}], + "i.": [{"F": "i."}], + "j.": [{"F": "j."}], + "k.": [{"F": "k."}], + "l.": [{"F": "l."}], + "m.": [{"F": "m."}], + "n.": [{"F": "n."}], + "o.": [{"F": "o."}], + "p.": [{"F": "p."}], + "q.": [{"F": "q."}], + "r.": [{"F": "r."}], + "s.": [{"F": "s."}], + "t.": [{"F": "t."}], + "u.": [{"F": "u."}], + "v.": [{"F": "v."}], + "w.": [{"F": "w."}], + "x.": [{"F": "x."}], + "y.": [{"F": "y."}], + "z.": [{"F": "z."}], +} + + +TAG_MAP = { +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NNE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PROAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, +"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"}, +"SP": {"pos": "SPACE"} +}