Merge language data change

Matthew Honnibal 2016-12-18 17:03:52 +01:00
commit 7a98ee5e5a
45 changed files with 4200 additions and 4322 deletions


@@ -1,229 +0,0 @@
"""Set up a model directory.
Requires:
lang_data --- Rules for the tokenizer
* prefix.txt
* suffix.txt
* infix.txt
* morphs.json
* specials.json
corpora --- Data files
* WordNet
* words.sgt.prob --- Smoothed unigram probabilities
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.bz2 --- output of something like word2vec, compressed with bzip
"""
from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
import json
import plac
from pathlib import Path
from shutil import copyfile
from shutil import copytree
from collections import defaultdict
import io
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ
from spacy.util import get_lang_class
try:
unicode
except NameError:
unicode = str
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
tok_dir.mkdir()
for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json',
'suffix.txt'):
src = lang_data_dir / filename
dst = tok_dir / filename
copyfile(str(src), str(dst))
def _read_clusters(loc):
if not loc.exists():
print("Warning: Clusters file not found")
return {}
clusters = {}
for line in io.open(str(loc), 'r', encoding='utf8'):
try:
cluster, word, freq = line.split()
except ValueError:
continue
# If the clusterer has only seen the word a few times, its cluster is
# unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def _read_probs(loc):
if not loc.exists():
print("Probabilities file not found. Trying freqs.")
return {}, 0.0
probs = {}
for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')):
prob, word = line.split()
prob = float(prob)
probs[word] = prob
return probs, probs['-OOV-']
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
if not loc.exists():
print("Warning: Frequencies file not found")
return {}, 0.0
counts = PreshCounter()
total = 0
if str(loc).endswith('gz'):
file_ = gzip.open(str(loc))
else:
file_ = loc.open()
for i, line in enumerate(file_):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
if str(loc).endswith('gz'):
file_ = gzip.open(str(loc))
else:
file_ = loc.open()
probs = {}
for line in file_:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print("Warning: WordNet senses not found")
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
for line in codecs.open(str(loc), 'r', 'utf8'):
sense_strings = line.split()
word = sense_strings.pop(0)
for sense in sense_strings:
pos, sense = sense[3:].split('.')
sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
if sense_name != 'N_tops':
sense_id = sense_names[sense_name]
lexicon[word][pos_ids[pos]].append(sense_id)
return lexicon
def setup_vocab(lex_attr_getters, tag_map, src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
print('Reading vocab from ', src_dir)
vectors_src = src_dir / 'vectors.bz2'
if vectors_src.exists():
write_binary_vectors(vectors_src.as_posix(), (dst_dir / 'vec.bin').as_posix())
else:
print("Warning: Word vectors file not found")
vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
if not probs:
oov_prob = -20
else:
oov_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = oov_prob
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
# First encode the strings into the StringStore. This way, we can map
# the orth IDs to frequency ranks
orth = vocab.strings[word]
# Now actually load the vocab
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
vocab.dump((dst_dir / 'lexemes.bin').as_posix())
with (dst_dir / 'strings.json').open('w') as file_:
vocab.strings.dump(file_)
with (dst_dir / 'oov_prob').open('w') as file_:
file_.write('%f' % oov_prob)
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id
assert corpora_dir.exists()
assert lang_data_dir.exists()
if not model_dir.exists():
model_dir.mkdir()
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(get_lang_class(lang_id).Defaults.lex_attr_getters, tag_map, corpora_dir,
model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists():
copyfile((lang_data_dir / 'gazetteer.json').as_posix(),
(model_dir / 'vocab' / 'gazetteer.json').as_posix())
copyfile((lang_data_dir / 'tag_map.json').as_posix(),
(model_dir / 'vocab' / 'tag_map.json').as_posix())
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile((lang_data_dir / 'lemma_rules.json').as_posix(),
(model_dir / 'vocab' / 'lemma_rules.json').as_posix())
if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
copytree((corpora_dir / 'wordnet' / 'dict').as_posix(),
(model_dir / 'wordnet').as_posix())
if __name__ == '__main__':
plac.call(main)
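
The deleted bin/init_model.py above expects the directory layout described in its docstring and was invoked as in the fabfile below (python bin/init_model.py en lang_data corpora spacy/en/data). The following is a minimal pre-flight sketch of that layout and invocation; the paths and file names are illustrative placeholders, not part of this commit:

from pathlib import Path
import subprocess

lang_id = 'en'                     # hypothetical language code
lang_data_dir = Path('lang_data')  # holds en/prefix.txt, suffix.txt, infix.txt, morphs.json, specials.json, tag_map.json
corpora_dir = Path('corpora')      # holds en/clusters.txt, en/words.sgt.prob or freqs.txt.gz, en/vectors.bz2
model_dir = Path('spacy/en/data')  # output directory created by the script

# Check the tokenizer inputs listed in the docstring before running the script.
required = [lang_data_dir / lang_id / name for name in
            ('prefix.txt', 'suffix.txt', 'infix.txt', 'morphs.json', 'specials.json', 'tag_map.json')]
missing = [str(p) for p in required if not p.exists()]
if missing:
    raise SystemExit('missing language data: ' + ', '.join(missing))

subprocess.check_call(['python', 'bin/init_model.py', lang_id,
                       str(lang_data_dir), str(corpora_dir), str(model_dir)])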

fabfile.py (vendored, 163 lines changed)

@@ -13,134 +13,6 @@ PWD = path.dirname(__file__)
VENV_DIR = path.join(PWD, '.env')
def counts():
pass
# Tokenize the corpus
# tokenize()
# get_freqs()
# Collate the counts
# cat freqs | sort -k2 | gather_freqs()
# gather_freqs()
# smooth()
# clean, make, sdist
# cd to new env, install from sdist,
# Push changes to server
# Pull changes on server
# clean make init model
# test --vectors --slow
# train
# test --vectors --slow --models
# sdist
# upload data to server
# change to clean venv
# py2: install from sdist, test --slow, download data, test --models --vectors
# py3: install from sdist, test --slow, download data, test --models --vectors
def prebuild(build_dir='/tmp/build_spacy'):
if file_exists(build_dir):
shutil.rmtree(build_dir)
os.mkdir(build_dir)
spacy_dir = path.dirname(__file__)
wn_url = 'http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz'
build_venv = path.join(build_dir, '.env')
with lcd(build_dir):
local('git clone %s .' % spacy_dir)
local('virtualenv ' + build_venv)
with prefix('cd %s && PYTHONPATH=`pwd` && . %s/bin/activate' % (build_dir, build_venv)):
local('pip install cython fabric fabtools pytest')
local('pip install --no-cache-dir -r requirements.txt')
local('fab clean make')
local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir)
local('PYTHONPATH=`pwd` python bin/init_model.py en lang_data corpora spacy/en/data')
local('PYTHONPATH=`pwd` fab test')
local('PYTHONPATH=`pwd` python -m spacy.en.download --force all')
local('PYTHONPATH=`pwd` py.test --models spacy/tests/')
def web():
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
out_loc = path.join(pwd, 'site', out_dir)
local('jade -P %s --out %s' % (jade_loc, out_loc))
with virtualenv(VENV_DIR):
local('./website/create_code_samples spacy/tests/website/ website/src/code/')
jade('404.jade', '')
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
jade('blog/index.jade', 'blog/')
for collection in ('blog', 'tutorials'):
for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir():
if post_dir.is_dir() \
and (post_dir / 'index.jade').exists() \
and (post_dir / 'meta.jade').exists():
jade(str(post_dir / 'index.jade'), path.join(collection, post_dir.parts[-1]))
def web_publish(assets_path):
from boto.s3.connection import S3Connection, OrdinaryCallingFormat
site_path = 'website/site'
os.environ['S3_USE_SIGV4'] = 'True'
conn = S3Connection(host='s3.eu-central-1.amazonaws.com',
calling_format=OrdinaryCallingFormat())
bucket = conn.get_bucket('spacy.io', validate=False)
keys_left = set([k.name for k in bucket.list()
if not k.name.startswith('resources')])
for root, dirnames, filenames in os.walk(site_path):
for dirname in dirnames:
target = os.path.relpath(os.path.join(root, dirname), site_path)
source = os.path.join(target, 'index.html')
if os.path.exists(os.path.join(root, dirname, 'index.html')):
key = bucket.new_key(source)
key.set_redirect('//%s/%s' % (bucket.name, target))
print('adding redirect for %s' % target)
keys_left.remove(source)
for filename in filenames:
source = os.path.join(root, filename)
target = os.path.relpath(root, site_path)
if target == '.':
target = filename
elif filename != 'index.html':
target = os.path.join(target, filename)
key = bucket.new_key(target)
key.set_metadata('Content-Type', 'text/html')
key.set_contents_from_filename(source)
print('uploading %s' % target)
keys_left.remove(target)
for key_name in keys_left:
print('deleting %s' % key_name)
bucket.delete_key(key_name)
local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
def publish(version):
with virtualenv(VENV_DIR):
local('git push origin master')
local('git tag -a %s' % version)
local('git push origin %s' % version)
local('python setup.py sdist')
local('python setup.py register')
local('twine upload dist/spacy-%s.tar.gz' % version)
def env(lang="python2.7"):
if file_exists('.env'):
local('rm -rf .env')
@@ -172,38 +44,3 @@ def test():
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('py.test -x spacy/tests')
def train(json_dir=None, dev_loc=None, model_dir=None):
if json_dir is None:
json_dir = 'corpora/en/json'
if model_dir is None:
model_dir = 'models/en/'
with virtualenv(VENV_DIR):
with lcd(path.dirname(__file__)):
local('python bin/init_model.py en lang_data/ corpora/ ' + model_dir)
local('python bin/parser/train.py -p en %s/train/ %s/development %s' % (json_dir, json_dir, model_dir))
def travis():
local('open https://travis-ci.org/honnibal/thinc')
def pos():
with virtualenv(VENV_DIR):
local('python tools/train.py ~/work_data/docparse/wsj02-21.conll ~/work_data/docparse/wsj22.conll spacy/en/data')
local('python tools/tag.py ~/work_data/docparse/wsj22.raw /tmp/tmp')
local('python tools/eval_pos.py ~/work_data/docparse/wsj22.conll /tmp/tmp')
def ner():
local('rm -rf data/en/ner')
local('python tools/train_ner.py ~/work_data/docparse/wsj02-21.conll data/en/ner')
local('python tools/tag_ner.py ~/work_data/docparse/wsj22.raw /tmp/tmp')
local('python tools/eval_ner.py ~/work_data/docparse/wsj22.conll /tmp/tmp | tail')
def conll():
local('rm -rf data/en/ner')
local('python tools/conll03_train.py ~/work_data/ner/conll2003/eng.train data/en/ner/')
local('python tools/conll03_eval.py ~/work_data/ner/conll2003/eng.testa')


@@ -87,5 +87,3 @@ cpdef enum attr_id_t:
PROB
LANG


@@ -120,8 +120,14 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
# for name, value in morphs.items():
# stringy_attrs[name] = value
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Number', 'PronType', 'AdjType', 'Person']
for key in morph_keys:
if key in stringy_attrs:
stringy_attrs.pop(key)
for name, value in stringy_attrs.items():
if isinstance(name, int):
int_key = name


@@ -5,25 +5,8 @@ from os import path
from ..language import Language
from ..attrs import LANG
from . import language_data
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class German(Language):
@@ -35,8 +18,5 @@ class German(Language):
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS


@@ -1,772 +1,21 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TAG_MAP = {
"$(": {TAG: PUNCT, "PunctType": "brck"},
"$,": {TAG: PUNCT, "PunctType": "comm"},
"$.": {TAG: PUNCT, "PunctType": "peri"},
"ADJA": {TAG: ADJ},
"ADJD": {TAG: ADJ, "Variant": "short"},
"ADV": {TAG: ADV},
"APPO": {TAG: ADP, "AdpType": "post"},
"APPR": {TAG: ADP, "AdpType": "prep"},
"APPRART": {TAG: ADP, "AdpType": "prep", "PronType": "art"},
"APZR": {TAG: ADP, "AdpType": "circ"},
"ART": {TAG: DET, "PronType": "art"},
"CARD": {TAG: NUM, "NumType": "card"},
"FM": {TAG: X, "Foreign": "yes"},
"ITJ": {TAG: INTJ},
"KOKOM": {TAG: CONJ, "ConjType": "comp"},
"KON": {TAG: CONJ},
"KOUI": {TAG: SCONJ},
"KOUS": {TAG: SCONJ},
"NE": {TAG: PROPN},
"NNE": {TAG: PROPN},
"NN": {TAG: NOUN},
"PAV": {TAG: ADV, "PronType": "dem"},
"PROAV": {TAG: ADV, "PronType": "dem"},
"PDAT": {TAG: DET, "PronType": "dem"},
"PDS": {TAG: PRON, "PronType": "dem"},
"PIAT": {TAG: DET, "PronType": "ind|neg|tot"},
"PIDAT": {TAG: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
"PIS": {TAG: PRON, "PronType": "ind|neg|tot"},
"PPER": {TAG: PRON, "PronType": "prs"},
"PPOSAT": {TAG: DET, "Poss": "yes", "PronType": "prs"},
"PPOSS": {TAG: PRON, "Poss": "yes", "PronType": "prs"},
"PRELAT": {TAG: DET, "PronType": "rel"},
"PRELS": {TAG: PRON, "PronType": "rel"},
"PRF": {TAG: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {TAG: PART},
"PTKANT": {TAG: PART, "PartType": "res"},
"PTKNEG": {TAG: PART, "Negative": "yes"},
"PTKVZ": {TAG: PART, "PartType": "vbp"},
"PTKZU": {TAG: PART, "PartType": "inf"},
"PWAT": {TAG: DET, "PronType": "int"},
"PWAV": {TAG: ADV, "PronType": "int"},
"PWS": {TAG: PRON, "PronType": "int"},
"TRUNC": {TAG: X, "Hyph": "yes"},
"VAFIN": {TAG: AUX, "Mood": "ind", "VerbForm": "fin"},
"VAIMP": {TAG: AUX, "Mood": "imp", "VerbForm": "fin"},
"VAINF": {TAG: AUX, "VerbForm": "inf"},
"VAPP": {TAG: AUX, "Aspect": "perf", "VerbForm": "part"},
"VMFIN": {TAG: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"},
"VMINF": {TAG: VERB, "VerbForm": "inf", "VerbType": "mod"},
"VMPP": {TAG: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"},
"VVFIN": {TAG: VERB, "Mood": "ind", "VerbForm": "fin"},
"VVIMP": {TAG: VERB, "Mood": "imp", "VerbForm": "fin"},
"VVINF": {TAG: VERB, "VerbForm": "inf"},
"VVIZU": {TAG: VERB, "VerbForm": "inf"},
"VVPP": {TAG: VERB, "Aspect": "perf", "VerbForm": "part"},
"XY": {TAG: X},
"SP": {TAG: SPACE}
}
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
TAG_MAP = dict(TAG_MAP)
STOP_WORDS = set(STOP_WORDS)
STOP_WORDS = set("""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
aller allerdings alles allgemeinen als also am an andere anderen andern anders
auch auf aus ausser außer ausserdem außerdem
bald bei beide beiden beim beispiel bekannt bereits besonders besser besten bin
bis bisher bist
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
da dabei dadurch dafür dagegen daher dahin dahinter damals damit danach daneben
dank dann daran darauf daraus darf darfst darin darüber darum darunter das
dasein daselbst dass daß dasselbe davon davor dazu dazwischen dein deine deinem
deiner dem dementsprechend demgegenüber demgemäss demgemäß demselben demzufolge
den denen denn denselben der deren derjenige derjenigen dermassen dermaßen
derselbe derselben des deshalb desselben dessen deswegen dich die diejenige
diejenigen dies diese dieselbe dieselben diesem diesen dieser dieses dir doch
dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft
durfte durften
eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine
einem einen einer eines einige einigen einiger einiges einmal einmaleins elf en
ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch
früher fünf fünfte fünften fünfter fünftes für
gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen
geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige
gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen
großen grosser großer grosses großes gut gute guter gutes
habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier
hin hinter hoch
ich ihm ihn ihnen ihr ihre ihrem ihrer ihres im immer in indem infolgedessen
ins irgend ist
ja jahr jahre jahren je jede jedem jeden jeder jedermann jedermanns jedoch
jemand jemandem jemanden jene jenem jenen jener jenes jetzt
kam kann kannst kaum kein keine keinem keinen keiner kleine kleinen kleiner
kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel
mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst
musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
neuntes nicht nichts nie niemand niemandem niemanden noch nun nur
ob oben oder offen oft ohne
recht rechte rechten rechter rechtes richtig rund
sagt sagte sah satt schlecht schon sechs sechste sechsten sechster sechstes
sehr sei seid seien sein seine seinem seinen seiner seines seit seitdem selbst
selbst sich sie sieben siebente siebenten siebenter siebentes siebte siebten
siebter siebtes sind so solang solche solchem solchen solcher solches soll
sollen sollte sollten sondern sonst sowie später statt
tag tage tagen tat teil tel trotzdem tun
über überhaupt übrigens uhr um und uns unser unsere unserer unter
vergangene vergangenen viel viele vielem vielen vielleicht vier vierte vierten
vierter viertes vom von vor
wahr während währenddem währenddessen wann war wäre waren wart warum was wegen
weil weit weiter weitere weiteren weiteres welche welchem welchen welcher
welches wem wen wenig wenige weniger weniges wenigstens wenn wer werde werden
werdet wessen wie wieder will willst wir wird wirklich wirst wo wohl wollen
wollt wollte wollten worden wurde würde wurden würden
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
""".split())
TOKENIZER_EXCEPTIONS = {
"\\n": [
{ORTH: "\\n", LEMMA: "<nl>", TAG: "SP"}
],
"\\t": [
{ORTH: "\\t", LEMMA: "<tab>", TAG: "SP"}
],
"'S": [
{ORTH: "'S", LEMMA: PRON_LEMMA}
],
"'n": [
{ORTH: "'n", LEMMA: "ein"}
],
"'ne": [
{ORTH: "'ne", LEMMA: "eine"}
],
"'nen": [
{ORTH: "'nen", LEMMA: "einen"}
],
"'s": [
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"Abb.": [
{ORTH: "Abb.", LEMMA: "Abbildung"}
],
"Abk.": [
{ORTH: "Abk.", LEMMA: "Abkürzung"}
],
"Abt.": [
{ORTH: "Abt.", LEMMA: "Abteilung"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "August"}
],
"Bd.": [
{ORTH: "Bd.", LEMMA: "Band"}
],
"Betr.": [
{ORTH: "Betr.", LEMMA: "Betreff"}
],
"Bf.": [
{ORTH: "Bf.", LEMMA: "Bahnhof"}
],
"Bhf.": [
{ORTH: "Bhf.", LEMMA: "Bahnhof"}
],
"Bsp.": [
{ORTH: "Bsp.", LEMMA: "Beispiel"}
],
"Dez.": [
{ORTH: "Dez.", LEMMA: "Dezember"}
],
"Di.": [
{ORTH: "Di.", LEMMA: "Dienstag"}
],
"Do.": [
{ORTH: "Do.", LEMMA: "Donnerstag"}
],
"Fa.": [
{ORTH: "Fa.", LEMMA: "Firma"}
],
"Fam.": [
{ORTH: "Fam.", LEMMA: "Familie"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "Februar"}
],
"Fr.": [
{ORTH: "Fr.", LEMMA: "Frau"}
],
"Frl.": [
{ORTH: "Frl.", LEMMA: "Fräulein"}
],
"Hbf.": [
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}
],
"Hr.": [
{ORTH: "Hr.", LEMMA: "Herr"}
],
"Hrn.": [
{ORTH: "Hrn.", LEMMA: "Herr"}
],
"Jan.": [
{ORTH: "Jan.", LEMMA: "Januar"}
],
"Jh.": [
{ORTH: "Jh.", LEMMA: "Jahrhundert"}
],
"Jhd.": [
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "Juli"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "Juni"}
],
"Mi.": [
{ORTH: "Mi.", LEMMA: "Mittwoch"}
],
"Mio.": [
{ORTH: "Mio.", LEMMA: "Million"}
],
"Mo.": [
{ORTH: "Mo.", LEMMA: "Montag"}
],
"Mrd.": [
{ORTH: "Mrd.", LEMMA: "Milliarde"}
],
"Mrz.": [
{ORTH: "Mrz.", LEMMA: "März"}
],
"MwSt.": [
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}
],
"Mär.": [
{ORTH: "Mär.", LEMMA: "März"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Nr.": [
{ORTH: "Nr.", LEMMA: "Nummer"}
],
"Okt.": [
{ORTH: "Okt.", LEMMA: "Oktober"}
],
"Orig.": [
{ORTH: "Orig.", LEMMA: "Original"}
],
"Pkt.": [
{ORTH: "Pkt.", LEMMA: "Punkt"}
],
"Prof.": [
{ORTH: "Prof.", LEMMA: "Professor"}
],
"Red.": [
{ORTH: "Red.", LEMMA: "Redaktion"}
],
"S'": [
{ORTH: "S'", LEMMA: PRON_LEMMA}
],
"Sa.": [
{ORTH: "Sa.", LEMMA: "Samstag"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"So.": [
{ORTH: "So.", LEMMA: "Sonntag"}
],
"Std.": [
{ORTH: "Std.", LEMMA: "Stunde"}
],
"Str.": [
{ORTH: "Str.", LEMMA: "Straße"}
],
"Tel.": [
{ORTH: "Tel.", LEMMA: "Telefon"}
],
"Tsd.": [
{ORTH: "Tsd.", LEMMA: "Tausend"}
],
"Univ.": [
{ORTH: "Univ.", LEMMA: "Universität"}
],
"abzgl.": [
{ORTH: "abzgl.", LEMMA: "abzüglich"}
],
"allg.": [
{ORTH: "allg.", LEMMA: "allgemein"}
],
"auf'm": [
{ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"bspw.": [
{ORTH: "bspw.", LEMMA: "beispielsweise"}
],
"bzgl.": [
{ORTH: "bzgl.", LEMMA: "bezüglich"}
],
"bzw.": [
{ORTH: "bzw.", LEMMA: "beziehungsweise"}
],
"d.h.": [
{ORTH: "d.h.", LEMMA: "das heißt"}
],
"dgl.": [
{ORTH: "dgl.", LEMMA: "dergleichen"}
],
"du's": [
{ORTH: "du", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ebd.": [
{ORTH: "ebd.", LEMMA: "ebenda"}
],
"eigtl.": [
{ORTH: "eigtl.", LEMMA: "eigentlich"}
],
"engl.": [
{ORTH: "engl.", LEMMA: "englisch"}
],
"er's": [
{ORTH: "er", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"evtl.": [
{ORTH: "evtl.", LEMMA: "eventuell"}
],
"frz.": [
{ORTH: "frz.", LEMMA: "französisch"}
],
"gegr.": [
{ORTH: "gegr.", LEMMA: "gegründet"}
],
"ggf.": [
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}
],
"ggfs.": [
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}
],
"ggü.": [
{ORTH: "ggü.", LEMMA: "gegenüber"}
],
"hinter'm": [
{ORTH: "hinter", LEMMA: "hinter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"i.O.": [
{ORTH: "i.O.", LEMMA: "in Ordnung"}
],
"i.d.R.": [
{ORTH: "i.d.R.", LEMMA: "in der Regel"}
],
"ich's": [
{ORTH: "ich", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ihr's": [
{ORTH: "ihr", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"incl.": [
{ORTH: "incl.", LEMMA: "inklusive"}
],
"inkl.": [
{ORTH: "inkl.", LEMMA: "inklusive"}
],
"insb.": [
{ORTH: "insb.", LEMMA: "insbesondere"}
],
"kath.": [
{ORTH: "kath.", LEMMA: "katholisch"}
],
"lt.": [
{ORTH: "lt.", LEMMA: "laut"}
],
"max.": [
{ORTH: "max.", LEMMA: "maximal"}
],
"min.": [
{ORTH: "min.", LEMMA: "minimal"}
],
"mind.": [
{ORTH: "mind.", LEMMA: "mindestens"}
],
"mtl.": [
{ORTH: "mtl.", LEMMA: "monatlich"}
],
"n.Chr.": [
{ORTH: "n.Chr.", LEMMA: "nach Christus"}
],
"orig.": [
{ORTH: "orig.", LEMMA: "original"}
],
"röm.": [
{ORTH: "röm.", LEMMA: "römisch"}
],
"s'": [
{ORTH: "s'", LEMMA: PRON_LEMMA}
],
"s.o.": [
{ORTH: "s.o.", LEMMA: "siehe oben"}
],
"sie's": [
{ORTH: "sie", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"sog.": [
{ORTH: "sog.", LEMMA: "so genannt"}
],
"stellv.": [
{ORTH: "stellv.", LEMMA: "stellvertretend"}
],
"tägl.": [
{ORTH: "tägl.", LEMMA: "täglich"}
],
"u.U.": [
{ORTH: "u.U.", LEMMA: "unter Umständen"}
],
"u.s.w.": [
{ORTH: "u.s.w.", LEMMA: "und so weiter"}
],
"u.v.m.": [
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}
],
"unter'm": [
{ORTH: "unter", LEMMA: "unter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"usf.": [
{ORTH: "usf.", LEMMA: "und so fort"}
],
"usw.": [
{ORTH: "usw.", LEMMA: "und so weiter"}
],
"uvm.": [
{ORTH: "uvm.", LEMMA: "und vieles mehr"}
],
"v.Chr.": [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}
],
"v.a.": [
{ORTH: "v.a.", LEMMA: "vor allem"}
],
"v.l.n.r.": [
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}
],
"vgl.": [
{ORTH: "vgl.", LEMMA: "vergleiche"}
],
"vllt.": [
{ORTH: "vllt.", LEMMA: "vielleicht"}
],
"vlt.": [
{ORTH: "vlt.", LEMMA: "vielleicht"}
],
"vor'm": [
{ORTH: "vor", LEMMA: "vor"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"wir's": [
{ORTH: "wir", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"z.B.": [
{ORTH: "z.B.", LEMMA: "zum Beispiel"}
],
"z.Bsp.": [
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}
],
"z.T.": [
{ORTH: "z.T.", LEMMA: "zum Teil"}
],
"z.Z.": [
{ORTH: "z.Z.", LEMMA: "zur Zeit"}
],
"z.Zt.": [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}
],
"z.b.": [
{ORTH: "z.b.", LEMMA: "zum Beispiel"}
],
"zzgl.": [
{ORTH: "zzgl.", LEMMA: "zuzüglich"}
],
"österr.": [
{ORTH: "österr.", LEMMA: "österreichisch"}
],
"über'm": [
{ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
]
}
ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.",
"a.D.",
"A.D.",
"A.G.",
"a.M.",
"a.Z.",
"Abs.",
"adv.",
"al.",
"b.",
"B.A.",
"B.Sc.",
"betr.",
"biol.",
"Biol.",
"c.",
"ca.",
"Chr.",
"Cie.",
"co.",
"Co.",
"d.",
"D.C.",
"Dipl.-Ing.",
"Dipl.",
"Dr.",
"e.",
"e.g.",
"e.V.",
"ehem.",
"entspr.",
"erm.",
"etc.",
"ev.",
"f.",
"g.",
"G.m.b.H.",
"geb.",
"Gebr.",
"gem.",
"h.",
"h.c.",
"Hg.",
"hrsg.",
"Hrsg.",
"i.",
"i.A.",
"i.e.",
"i.G.",
"i.Tr.",
"i.V.",
"Ing.",
"j.",
"jr.",
"Jr.",
"jun.",
"jur.",
"k.",
"K.O.",
"l.",
"L.A.",
"lat.",
"m.",
"M.A.",
"m.E.",
"m.M.",
"M.Sc.",
"Mr.",
"n.",
"N.Y.",
"N.Y.C.",
"nat.",
"ö."
"o.",
"o.a.",
"o.ä.",
"o.g.",
"o.k.",
"O.K.",
"p.",
"p.a.",
"p.s.",
"P.S.",
"pers.",
"phil.",
"q.",
"q.e.d.",
"r.",
"R.I.P.",
"rer.",
"s.",
"sen.",
"St.",
"std.",
"t.",
"u.",
"ü.",
"u.a.",
"U.S.",
"U.S.A.",
"U.S.S.",
"v.",
"Vol.",
"vs.",
"w.",
"wiss.",
"x.",
"y.",
"z.",
]
__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"]
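
The slimmed-down spacy/de/language_data.py above now just recombines the split modules via update_exc and strings_to_exc. Below is a minimal sketch of what those two helpers are used for here, assuming a plain merge (the real helpers live in spacy/language_data and may do extra validation); ORTH is a stand-in for the spacy.symbols constant:

ORTH = 'orth'  # placeholder for the spacy.symbols attribute ID

def strings_to_exc(orths):
    # One exception per string: the string maps to a single token with that orth.
    return {orth: [{ORTH: orth}] for orth in orths}

def update_exc(exc, additions):
    # Merge the new exceptions into the existing table in place.
    exc.update(additions)

TOKENIZER_EXCEPTIONS = {"z.B.": [{ORTH: "z.B."}]}
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(["ca.", "usw."]))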

spacy/de/stop_words.py (new file, 81 lines)

@@ -0,0 +1,81 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
á a ab aber ach acht achte achten achter achtes ag alle allein allem allen
aller allerdings alles allgemeinen als also am an andere anderen andern anders
auch auf aus ausser außer ausserdem außerdem
bald bei beide beiden beim beispiel bekannt bereits besonders besser besten bin
bis bisher bist
da dabei dadurch dafür dagegen daher dahin dahinter damals damit danach daneben
dank dann daran darauf daraus darf darfst darin darüber darum darunter das
dasein daselbst dass daß dasselbe davon davor dazu dazwischen dein deine deinem
deiner dem dementsprechend demgegenüber demgemäss demgemäß demselben demzufolge
den denen denn denselben der deren derjenige derjenigen dermassen dermaßen
derselbe derselben des deshalb desselben dessen deswegen dich die diejenige
diejenigen dies diese dieselbe dieselben diesem diesen dieser dieses dir doch
dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft
durfte durften
eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine
einem einen einer eines einige einigen einiger einiges einmal einmaleins elf en
ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch
früher fünf fünfte fünften fünfter fünftes für
gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen
geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige
gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen
großen grosser großer grosses großes gut gute guter gutes
habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier
hin hinter hoch
ich ihm ihn ihnen ihr ihre ihrem ihrer ihres im immer in indem infolgedessen
ins irgend ist
ja jahr jahre jahren je jede jedem jeden jeder jedermann jedermanns jedoch
jemand jemandem jemanden jene jenem jenen jener jenes jetzt
kam kann kannst kaum kein keine keinem keinen keiner kleine kleinen kleiner
kleines kommen kommt können könnt konnte könnte konnten kurz
lang lange leicht leider lieber los
machen macht machte mag magst man manche manchem manchen mancher manches mehr
mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel
mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst
musste mussten
na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter
neuntes nicht nichts nie niemand niemandem niemanden noch nun nur
ob oben oder offen oft ohne
recht rechte rechten rechter rechtes richtig rund
sagt sagte sah satt schlecht schon sechs sechste sechsten sechster sechstes
sehr sei seid seien sein seine seinem seinen seiner seines seit seitdem selbst
selbst sich sie sieben siebente siebenten siebenter siebentes siebte siebten
siebter siebtes sind so solang solche solchem solchen solcher solches soll
sollen sollte sollten sondern sonst sowie später statt
tag tage tagen tat teil tel trotzdem tun
über überhaupt übrigens uhr um und uns unser unsere unserer unter
vergangene vergangenen viel viele vielem vielen vielleicht vier vierte vierten
vierter viertes vom von vor
wahr während währenddem währenddessen wann war wäre waren wart warum was wegen
weil weit weiter weitere weiteren weiteres welche welchem welchen welcher
welches wem wen wenig wenige weniger weniges wenigstens wenn wer werde werden
werdet wessen wie wieder will willst wir wird wirklich wirst wo wohl wollen
wollt wollte wollten worden wurde würde wurden würden
zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur
zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
""".split())

spacy/de/tag_map.py (new file, 65 lines)

@@ -0,0 +1,65 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
"$(": {POS: PUNCT, "PunctType": "brck"},
"$,": {POS: PUNCT, "PunctType": "comm"},
"$.": {POS: PUNCT, "PunctType": "peri"},
"ADJA": {POS: ADJ},
"ADJD": {POS: ADJ, "Variant": "short"},
"ADV": {POS: ADV},
"APPO": {POS: ADP, "AdpType": "post"},
"APPR": {POS: ADP, "AdpType": "prep"},
"APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"},
"APZR": {POS: ADP, "AdpType": "circ"},
"ART": {POS: DET, "PronType": "art"},
"CARD": {POS: NUM, "NumType": "card"},
"FM": {POS: X, "Foreign": "yes"},
"ITJ": {POS: INTJ},
"KOKOM": {POS: CONJ, "ConjType": "comp"},
"KON": {POS: CONJ},
"KOUI": {POS: SCONJ},
"KOUS": {POS: SCONJ},
"NE": {POS: PROPN},
"NNE": {POS: PROPN},
"NN": {POS: NOUN},
"PAV": {POS: ADV, "PronType": "dem"},
"PROAV": {POS: ADV, "PronType": "dem"},
"PDAT": {POS: DET, "PronType": "dem"},
"PDS": {POS: PRON, "PronType": "dem"},
"PIAT": {POS: DET, "PronType": "ind|neg|tot"},
"PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"},
"PIS": {POS: PRON, "PronType": "ind|neg|tot"},
"PPER": {POS: PRON, "PronType": "prs"},
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
"PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"},
"PRELAT": {POS: DET, "PronType": "rel"},
"PRELS": {POS: PRON, "PronType": "rel"},
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Negative": "yes"},
"PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},
"PWAV": {POS: ADV, "PronType": "int"},
"PWS": {POS: PRON, "PronType": "int"},
"TRUNC": {POS: X, "Hyph": "yes"},
"VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"},
"VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"},
"VAINF": {POS: AUX, "VerbForm": "inf"},
"VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"},
"VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"},
"VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"},
"VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"},
"VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"},
"VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"},
"VVINF": {POS: VERB, "VerbForm": "inf"},
"VVIZU": {POS: VERB, "VerbForm": "inf"},
"VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
"XY": {POS: X},
"SP": {POS: SPACE}
}
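
The keys of this tag map are STTS/TIGER part-of-speech tags; each entry supplies the coarse universal POS under the POS symbol plus any morphological features. A minimal lookup sketch, with plain strings standing in for the spacy.symbols constants:

POS = 'pos'  # placeholder for the spacy.symbols attribute ID

TAG_MAP = {
    "NN":    {POS: "NOUN"},
    "ART":   {POS: "DET", "PronType": "art"},
    "VVFIN": {POS: "VERB", "Mood": "ind", "VerbForm": "fin"},
}

def coarse_pos(fine_tag):
    # Unknown tags fall back to X; features are everything except the POS entry.
    entry = TAG_MAP.get(fine_tag, {POS: "X"})
    features = {key: value for key, value in entry.items() if key != POS}
    return entry[POS], features

print(coarse_pos("VVFIN"))  # ('VERB', {'Mood': 'ind', 'VerbForm': 'fin'})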


@@ -0,0 +1,629 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {
"\\n": [
{ORTH: "\\n", LEMMA: "<nl>", TAG: "SP"}
],
"\\t": [
{ORTH: "\\t", LEMMA: "<tab>", TAG: "SP"}
],
"'S": [
{ORTH: "'S", LEMMA: PRON_LEMMA}
],
"'n": [
{ORTH: "'n", LEMMA: "ein"}
],
"'ne": [
{ORTH: "'ne", LEMMA: "eine"}
],
"'nen": [
{ORTH: "'nen", LEMMA: "einen"}
],
"'s": [
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"Abb.": [
{ORTH: "Abb.", LEMMA: "Abbildung"}
],
"Abk.": [
{ORTH: "Abk.", LEMMA: "Abkürzung"}
],
"Abt.": [
{ORTH: "Abt.", LEMMA: "Abteilung"}
],
"Apr.": [
{ORTH: "Apr.", LEMMA: "April"}
],
"Aug.": [
{ORTH: "Aug.", LEMMA: "August"}
],
"Bd.": [
{ORTH: "Bd.", LEMMA: "Band"}
],
"Betr.": [
{ORTH: "Betr.", LEMMA: "Betreff"}
],
"Bf.": [
{ORTH: "Bf.", LEMMA: "Bahnhof"}
],
"Bhf.": [
{ORTH: "Bhf.", LEMMA: "Bahnhof"}
],
"Bsp.": [
{ORTH: "Bsp.", LEMMA: "Beispiel"}
],
"Dez.": [
{ORTH: "Dez.", LEMMA: "Dezember"}
],
"Di.": [
{ORTH: "Di.", LEMMA: "Dienstag"}
],
"Do.": [
{ORTH: "Do.", LEMMA: "Donnerstag"}
],
"Fa.": [
{ORTH: "Fa.", LEMMA: "Firma"}
],
"Fam.": [
{ORTH: "Fam.", LEMMA: "Familie"}
],
"Feb.": [
{ORTH: "Feb.", LEMMA: "Februar"}
],
"Fr.": [
{ORTH: "Fr.", LEMMA: "Frau"}
],
"Frl.": [
{ORTH: "Frl.", LEMMA: "Fräulein"}
],
"Hbf.": [
{ORTH: "Hbf.", LEMMA: "Hauptbahnhof"}
],
"Hr.": [
{ORTH: "Hr.", LEMMA: "Herr"}
],
"Hrn.": [
{ORTH: "Hrn.", LEMMA: "Herr"}
],
"Jan.": [
{ORTH: "Jan.", LEMMA: "Januar"}
],
"Jh.": [
{ORTH: "Jh.", LEMMA: "Jahrhundert"}
],
"Jhd.": [
{ORTH: "Jhd.", LEMMA: "Jahrhundert"}
],
"Jul.": [
{ORTH: "Jul.", LEMMA: "Juli"}
],
"Jun.": [
{ORTH: "Jun.", LEMMA: "Juni"}
],
"Mi.": [
{ORTH: "Mi.", LEMMA: "Mittwoch"}
],
"Mio.": [
{ORTH: "Mio.", LEMMA: "Million"}
],
"Mo.": [
{ORTH: "Mo.", LEMMA: "Montag"}
],
"Mrd.": [
{ORTH: "Mrd.", LEMMA: "Milliarde"}
],
"Mrz.": [
{ORTH: "Mrz.", LEMMA: "März"}
],
"MwSt.": [
{ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"}
],
"Mär.": [
{ORTH: "Mär.", LEMMA: "März"}
],
"Nov.": [
{ORTH: "Nov.", LEMMA: "November"}
],
"Nr.": [
{ORTH: "Nr.", LEMMA: "Nummer"}
],
"Okt.": [
{ORTH: "Okt.", LEMMA: "Oktober"}
],
"Orig.": [
{ORTH: "Orig.", LEMMA: "Original"}
],
"Pkt.": [
{ORTH: "Pkt.", LEMMA: "Punkt"}
],
"Prof.": [
{ORTH: "Prof.", LEMMA: "Professor"}
],
"Red.": [
{ORTH: "Red.", LEMMA: "Redaktion"}
],
"S'": [
{ORTH: "S'", LEMMA: PRON_LEMMA}
],
"Sa.": [
{ORTH: "Sa.", LEMMA: "Samstag"}
],
"Sep.": [
{ORTH: "Sep.", LEMMA: "September"}
],
"Sept.": [
{ORTH: "Sept.", LEMMA: "September"}
],
"So.": [
{ORTH: "So.", LEMMA: "Sonntag"}
],
"Std.": [
{ORTH: "Std.", LEMMA: "Stunde"}
],
"Str.": [
{ORTH: "Str.", LEMMA: "Straße"}
],
"Tel.": [
{ORTH: "Tel.", LEMMA: "Telefon"}
],
"Tsd.": [
{ORTH: "Tsd.", LEMMA: "Tausend"}
],
"Univ.": [
{ORTH: "Univ.", LEMMA: "Universität"}
],
"abzgl.": [
{ORTH: "abzgl.", LEMMA: "abzüglich"}
],
"allg.": [
{ORTH: "allg.", LEMMA: "allgemein"}
],
"auf'm": [
{ORTH: "auf", LEMMA: "auf"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"bspw.": [
{ORTH: "bspw.", LEMMA: "beispielsweise"}
],
"bzgl.": [
{ORTH: "bzgl.", LEMMA: "bezüglich"}
],
"bzw.": [
{ORTH: "bzw.", LEMMA: "beziehungsweise"}
],
"d.h.": [
{ORTH: "d.h.", LEMMA: "das heißt"}
],
"dgl.": [
{ORTH: "dgl.", LEMMA: "dergleichen"}
],
"du's": [
{ORTH: "du", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ebd.": [
{ORTH: "ebd.", LEMMA: "ebenda"}
],
"eigtl.": [
{ORTH: "eigtl.", LEMMA: "eigentlich"}
],
"engl.": [
{ORTH: "engl.", LEMMA: "englisch"}
],
"er's": [
{ORTH: "er", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"evtl.": [
{ORTH: "evtl.", LEMMA: "eventuell"}
],
"frz.": [
{ORTH: "frz.", LEMMA: "französisch"}
],
"gegr.": [
{ORTH: "gegr.", LEMMA: "gegründet"}
],
"ggf.": [
{ORTH: "ggf.", LEMMA: "gegebenenfalls"}
],
"ggfs.": [
{ORTH: "ggfs.", LEMMA: "gegebenenfalls"}
],
"ggü.": [
{ORTH: "ggü.", LEMMA: "gegenüber"}
],
"hinter'm": [
{ORTH: "hinter", LEMMA: "hinter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"i.O.": [
{ORTH: "i.O.", LEMMA: "in Ordnung"}
],
"i.d.R.": [
{ORTH: "i.d.R.", LEMMA: "in der Regel"}
],
"ich's": [
{ORTH: "ich", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"ihr's": [
{ORTH: "ihr", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"incl.": [
{ORTH: "incl.", LEMMA: "inklusive"}
],
"inkl.": [
{ORTH: "inkl.", LEMMA: "inklusive"}
],
"insb.": [
{ORTH: "insb.", LEMMA: "insbesondere"}
],
"kath.": [
{ORTH: "kath.", LEMMA: "katholisch"}
],
"lt.": [
{ORTH: "lt.", LEMMA: "laut"}
],
"max.": [
{ORTH: "max.", LEMMA: "maximal"}
],
"min.": [
{ORTH: "min.", LEMMA: "minimal"}
],
"mind.": [
{ORTH: "mind.", LEMMA: "mindestens"}
],
"mtl.": [
{ORTH: "mtl.", LEMMA: "monatlich"}
],
"n.Chr.": [
{ORTH: "n.Chr.", LEMMA: "nach Christus"}
],
"orig.": [
{ORTH: "orig.", LEMMA: "original"}
],
"röm.": [
{ORTH: "röm.", LEMMA: "römisch"}
],
"s'": [
{ORTH: "s'", LEMMA: PRON_LEMMA}
],
"s.o.": [
{ORTH: "s.o.", LEMMA: "siehe oben"}
],
"sie's": [
{ORTH: "sie", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"sog.": [
{ORTH: "sog.", LEMMA: "so genannt"}
],
"stellv.": [
{ORTH: "stellv.", LEMMA: "stellvertretend"}
],
"tägl.": [
{ORTH: "tägl.", LEMMA: "täglich"}
],
"u.U.": [
{ORTH: "u.U.", LEMMA: "unter Umständen"}
],
"u.s.w.": [
{ORTH: "u.s.w.", LEMMA: "und so weiter"}
],
"u.v.m.": [
{ORTH: "u.v.m.", LEMMA: "und vieles mehr"}
],
"unter'm": [
{ORTH: "unter", LEMMA: "unter"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"usf.": [
{ORTH: "usf.", LEMMA: "und so fort"}
],
"usw.": [
{ORTH: "usw.", LEMMA: "und so weiter"}
],
"uvm.": [
{ORTH: "uvm.", LEMMA: "und vieles mehr"}
],
"v.Chr.": [
{ORTH: "v.Chr.", LEMMA: "vor Christus"}
],
"v.a.": [
{ORTH: "v.a.", LEMMA: "vor allem"}
],
"v.l.n.r.": [
{ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"}
],
"vgl.": [
{ORTH: "vgl.", LEMMA: "vergleiche"}
],
"vllt.": [
{ORTH: "vllt.", LEMMA: "vielleicht"}
],
"vlt.": [
{ORTH: "vlt.", LEMMA: "vielleicht"}
],
"vor'm": [
{ORTH: "vor", LEMMA: "vor"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
],
"wir's": [
{ORTH: "wir", LEMMA: PRON_LEMMA},
{ORTH: "'s", LEMMA: PRON_LEMMA}
],
"z.B.": [
{ORTH: "z.B.", LEMMA: "zum Beispiel"}
],
"z.Bsp.": [
{ORTH: "z.Bsp.", LEMMA: "zum Beispiel"}
],
"z.T.": [
{ORTH: "z.T.", LEMMA: "zum Teil"}
],
"z.Z.": [
{ORTH: "z.Z.", LEMMA: "zur Zeit"}
],
"z.Zt.": [
{ORTH: "z.Zt.", LEMMA: "zur Zeit"}
],
"z.b.": [
{ORTH: "z.b.", LEMMA: "zum Beispiel"}
],
"zzgl.": [
{ORTH: "zzgl.", LEMMA: "zuzüglich"}
],
"österr.": [
{ORTH: "österr.", LEMMA: "österreichisch"}
],
"über'm": [
{ORTH: "über", LEMMA: "über"},
{ORTH: "'m", LEMMA: PRON_LEMMA}
]
}
ORTH_ONLY = [
"'",
"\\\")",
"<space>",
"a.",
"ä.",
"A.C.",
"a.D.",
"A.D.",
"A.G.",
"a.M.",
"a.Z.",
"Abs.",
"adv.",
"al.",
"b.",
"B.A.",
"B.Sc.",
"betr.",
"biol.",
"Biol.",
"c.",
"ca.",
"Chr.",
"Cie.",
"co.",
"Co.",
"d.",
"D.C.",
"Dipl.-Ing.",
"Dipl.",
"Dr.",
"e.",
"e.g.",
"e.V.",
"ehem.",
"entspr.",
"erm.",
"etc.",
"ev.",
"f.",
"g.",
"G.m.b.H.",
"geb.",
"Gebr.",
"gem.",
"h.",
"h.c.",
"Hg.",
"hrsg.",
"Hrsg.",
"i.",
"i.A.",
"i.e.",
"i.G.",
"i.Tr.",
"i.V.",
"Ing.",
"j.",
"jr.",
"Jr.",
"jun.",
"jur.",
"k.",
"K.O.",
"l.",
"L.A.",
"lat.",
"m.",
"M.A.",
"m.E.",
"m.M.",
"M.Sc.",
"Mr.",
"n.",
"N.Y.",
"N.Y.C.",
"nat.",
"ö."
"o.",
"o.a.",
"o.ä.",
"o.g.",
"o.k.",
"O.K.",
"p.",
"p.a.",
"p.s.",
"P.S.",
"pers.",
"phil.",
"q.",
"q.e.d.",
"r.",
"R.I.P.",
"rer.",
"s.",
"sen.",
"St.",
"std.",
"t.",
"u.",
"ü.",
"u.a.",
"U.S.",
"U.S.A.",
"U.S.S.",
"v.",
"Vol.",
"vs.",
"w.",
"wiss.",
"x.",
"y.",
"z.",
]
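
Each entry in TOKENIZER_EXCEPTIONS maps a surface string to the list of tokens it is split into, and the ORTH values of those tokens must concatenate back to the key; ORTH_ONLY strings become single-token exceptions with no extra attributes. Below is a minimal consistency check along those lines, with plain strings standing in for the spacy.symbols constants and '-PRON-' for PRON_LEMMA:

ORTH, LEMMA = 'orth', 'lemma'  # placeholders for the spacy.symbols attribute IDs

EXCEPTIONS = {
    "z.B.":  [{ORTH: "z.B.", LEMMA: "zum Beispiel"}],
    "auf'm": [{ORTH: "auf", LEMMA: "auf"}, {ORTH: "'m", LEMMA: "-PRON-"}],
}

# The concatenated orths of the pieces must reproduce the original string,
# otherwise the tokenizer could not match the exception against the text.
for string, tokens in EXCEPTIONS.items():
    assert ''.join(token[ORTH] for token in tokens) == string, string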


@@ -4,34 +4,12 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from .. import util
from ..lemmatizer import Lemmatizer
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import expand_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
from .language_data import get_time_exc
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", ""))
from .language_data import *
class English(Language):
@@ -42,8 +20,6 @@ class English(Language):
lex_attr_getters[LANG] = lambda text: 'en'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES

File diff suppressed because it is too large.

spacy/en/lemma_rules.py (new file, 42 lines)

@@ -0,0 +1,42 @@
# encoding: utf8
from __future__ import unicode_literals
LEMMA_RULES = {
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
],
"punct": [
["", "\""],
["", "\""],
["\u2018", "'"],
["\u2019", "'"]
]
}
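
Each rule is a [suffix, replacement] pair grouped by coarse part of speech: a candidate lemma is formed by stripping the suffix and appending the replacement. A minimal sketch of how rules like these could be applied; this is not spaCy's Lemmatizer, which also consults exception and index tables:

LEMMA_RULES = {
    "noun": [["s", ""], ["ses", "s"], ["ies", "y"]],
}

def candidate_lemmas(word, pos, rules=LEMMA_RULES):
    # Collect every form produced by a matching [suffix, replacement] pair.
    candidates = []
    for suffix, repl in rules.get(pos, []):
        if word.endswith(suffix):
            candidates.append(word[:len(word) - len(suffix)] + repl)
    return candidates or [word]

print(candidate_lemmas("stories", "noun"))  # ['storie', 'story']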

spacy/en/morph_rules.py (new file, 67 lines)

@@ -0,0 +1,67 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
MORPH_RULES = {
"PRP": {
"I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"},
"me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"},
"you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
"he": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"},
"him": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"},
"she": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"},
"her": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"},
"it": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"},
"we": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"},
"us": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"},
"they": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"},
"them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
"mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
"his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
"hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
"its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
"ours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"theirs": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
"myself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"yourself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"himself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"},
"herself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"},
"itself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"},
"themself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
"ourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"},
"yourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
"themselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"}
},
"PRP$": {
"my": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"},
"your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
"his": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"},
"her": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"},
"its": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
"our": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
"their": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}
},
"VBZ": {
"am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"},
"are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"},
"is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
},
"VBP": {
"are": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}
},
"VBD": {
"was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"},
"were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"}
}
}
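
MORPH_RULES is keyed first by the fine-grained tag and then by the token's text, and assigns a lemma plus morphological features to individual closed-class word forms. A minimal lookup sketch, with plain strings standing in for the spacy.symbols constants and '-PRON-' for PRON_LEMMA:

LEMMA = 'lemma'       # placeholder for the spacy.symbols attribute ID
PRON_LEMMA = '-PRON-'

MORPH_RULES = {
    "PRP": {
        "I":  {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"},
        "me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"},
    },
    "VBZ": {
        "is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"},
    },
}

def morph_for(tag, text):
    # Return the attributes assigned to this (tag, text) pair, or an empty dict.
    return MORPH_RULES.get(tag, {}).get(text, {})

print(morph_for("VBZ", "is"))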

spacy/en/stop_words.py (new file, 67 lines)

@@ -0,0 +1,67 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough etc even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in inc indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split())

spacy/en/tag_map.py (new file, 64 lines)

@@ -0,0 +1,64 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
".": {POS: PUNCT, "PunctType": "peri"},
",": {POS: PUNCT, "PunctType": "comm"},
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT},
"$": {POS: SYM, "Other": {"SymType": "currency"}},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
"AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CONJ, "ConjType": "coor"},
"CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET},
"EX": {POS: ADV, "AdvType": "ex"},
"FW": {POS: X, "Foreign": "yes"},
"HYPH": {POS: PUNCT, "PunctType": "dash"},
"IN": {POS: ADP},
"JJ": {POS: ADJ, "Degree": "pos"},
"JJR": {POS: ADJ, "Degree": "comp"},
"JJS": {POS: ADJ, "Degree": "sup"},
"LS": {POS: PUNCT, "NumType": "ord"},
"MD": {POS: VERB, "VerbType": "mod"},
"NIL": {POS: ""},
"NN": {POS: NOUN, "Number": "sing"},
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
"NNS": {POS: NOUN, "Number": "plur"},
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
"RB": {POS: ADV, "Degree": "pos"},
"RBR": {POS: ADV, "Degree": "comp"},
"RBS": {POS: ADV, "Degree": "sup"},
"RP": {POS: PART},
"SYM": {POS: SYM},
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
"UH": {POS: INTJ},
"VB": {POS: VERB, "VerbForm": "inf"},
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
"WDT": {POS: ADJ, "PronType": "int|rel"},
"WP": {POS: NOUN, "PronType": "int|rel"},
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
"WRB": {POS: ADV, "PronType": "int|rel"},
"SP": {POS: SPACE},
"ADD": {POS: X},
"NFP": {POS: PUNCT},
"GW": {POS: X},
"XX": {POS: X},
"BES": {POS: VERB},
"HVS": {POS: VERB}
}

File diff suppressed because it is too large.


@@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class Spanish(Language):
@@ -34,8 +17,4 @@ class Spanish(Language):
lex_attr_getters[LANG] = lambda text: 'es'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS


@@ -1,408 +1,19 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
TAG_MAP = {
}
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
STOP_WORDS = set(STOP_WORDS)
STOP_WORDS = set("""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
aseguró asi así atras aun aunque ayer añadió aún
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
bajo bastante bien breve buen buena buenas bueno buenos
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
conmigo conocer conseguimos conseguir considera consideró consigo consigue
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
cuánto cuántos cómo
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
días dónde
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
estamos estan estar estará estas este esto estos estoy estuvo está están ex
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
éstos
fin final fue fuera fueron fui fuimos
general gran grandes gueno
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
hizo horas hoy hubo
igual incluso indicó informo informó intenta intentais intentamos intentan
intentar intentas intento ir
junto
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
muchas mucho muchos muy más mía mías mío míos
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
ocho os otra otras otro otros
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
podrán podría podrían poner por porque posible primer primera primero primeros
principalmente pronto propia propias propio propios proximo próximo próximos
pudo pueda puede pueden puedo pues
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
raras realizado realizar realizó repente respecto
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
soyos su supuesto sus suya suyas suyo sólo
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
tuyos
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
última últimas último últimos
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
ya yo
""".split())
TOKENIZER_EXCEPTIONS = {
"accidentarse": [
{ORTH: "accidentar", LEMMA: "accidentar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"aceptarlo": [
{ORTH: "aceptar", LEMMA: "aceptar", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"acompañarla": [
{ORTH: "acompañar", LEMMA: "acompañar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"advertirle": [
{ORTH: "advertir", LEMMA: "advertir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"al": [
{ORTH: "a", LEMMA: "a", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"anunciarnos": [
{ORTH: "anunciar", LEMMA: "anunciar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"asegurándole": [
{ORTH: "asegurando", LEMMA: "asegurar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"considerarle": [
{ORTH: "considerar", LEMMA: "considerar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirle": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirles": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirte": [
{ORTH: "Decir", LEMMA: "decir", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarla": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarnos": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejándole": [
{ORTH: "dejando", LEMMA: "dejar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"del": [
{ORTH: "de", LEMMA: "de", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"demostrarles": [
{ORTH: "demostrar", LEMMA: "demostrar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndole": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndoles": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diferenciarse": [
{ORTH: "diferenciar", LEMMA: "diferenciar", POS: AUX},
{ORTH: "se", LEMMA: "él", POS: PRON}
],
"divirtiéndome": [
{ORTH: "divirtiendo", LEMMA: "divertir", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"ensanchándose": [
{ORTH: "ensanchando", LEMMA: "ensanchar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"explicarles": [
{ORTH: "explicar", LEMMA: "explicar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberla": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlas": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlo": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlos": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberme": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberse": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerle": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerles": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"hallarse": [
{ORTH: "hallar", LEMMA: "hallar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"imaginaros": [
{ORTH: "imaginar", LEMMA: "imaginar", POS: AUX},
{ORTH: "os", LEMMA: PRON_LEMMA, POS: PRON}
],
"insinuarle": [
{ORTH: "insinuar", LEMMA: "insinuar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"justificarla": [
{ORTH: "justificar", LEMMA: "justificar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlas": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlos": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerme": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"pasarte": [
{ORTH: "pasar", LEMMA: "pasar", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"pedirle": [
{ORTH: "pedir", LEMMA: "pedir", POS: AUX},
{ORTH: "le", LEMMA: "él", POS: PRON}
],
"pel": [
{ORTH: "per", LEMMA: "per", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"pidiéndonos": [
{ORTH: "pidiendo", LEMMA: "pedir", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"poderle": [
{ORTH: "poder", LEMMA: "poder", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntarse": [
{ORTH: "preguntar", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntándose": [
{ORTH: "preguntando", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"presentarla": [
{ORTH: "presentar", LEMMA: "presentar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndolo": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndose": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"quererle": [
{ORTH: "querer", LEMMA: "querer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"rasgarse": [
{ORTH: "Rasgar", LEMMA: "rasgar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"repetirlo": [
{ORTH: "repetir", LEMMA: "repetir", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"robarle": [
{ORTH: "robar", LEMMA: "robar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"seguirlos": [
{ORTH: "seguir", LEMMA: "seguir", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"serle": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"serlo": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"señalándole": [
{ORTH: "señalando", LEMMA: "señalar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"suplicarle": [
{ORTH: "suplicar", LEMMA: "suplicar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"tenerlos": [
{ORTH: "tener", LEMMA: "tener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"vengarse": [
{ORTH: "vengar", LEMMA: "vengar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"verla": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"verle": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"volverlo": [
{ORTH: "volver", LEMMA: "volver", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
]
}
ORTH_ONLY = [
]
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

84 spacy/es/stop_words.py Normal file
View File

@ -0,0 +1,84 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí
al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos
antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas
aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo
aseguró asi así atras aun aunque ayer añadió aún
bajo bastante bien breve buen buena buenas bueno buenos
cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con
conmigo conocer conseguimos conseguir considera consideró consigo consigue
consiguen consigues contigo contra cosas creo cual cuales cualquier cuando
cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas
cuánto cuántos cómo
da dado dan dar de debajo debe deben debido decir dejó del delante demasiado
demás dentro deprisa desde despacio despues después detras detrás dia dias dice
dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día
días dónde
ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas
empleo en encima encuentra enfrente enseguida entonces entre era eramos eran
eras eres es esa esas ese eso esos esta estaba estaban estado estados estais
estamos estan estar estará estas este esto estos estoy estuvo está están ex
excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste
éstos
fin final fue fuera fueron fui fuimos
general gran grandes gueno
ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer
hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron
hizo horas hoy hubo
igual incluso indicó informo informó intenta intentais intentamos intentan
intentar intentas intento ir
junto
la lado largo las le lejos les llegó lleva llevar lo los luego lugar
mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi
mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha
muchas mucho muchos muy más mía mías mío míos
nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros
nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca
ocho os otra otras otro otros
pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas
poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá
podrán podría podrían poner por porque posible primer primera primero primeros
principalmente pronto propia propias propio propios proximo próximo próximos
pudo pueda puede pueden puedo pues
qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué
raras realizado realizar realizó repente respecto
sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo
según seis ser sera será serán sería señaló si sido siempre siendo siete sigue
siguiente sin sino sobre sois sola solamente solas solo solos somos son soy
soyos su supuesto sus suya suyas suyo sólo
tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis
tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda
todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan
trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo
tuyos
ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes
última últimas último últimos
va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero
vez vosotras vosotros voy vuestra vuestras vuestro vuestros
ya yo
""".split())

View File

@ -0,0 +1,318 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
TOKENIZER_EXCEPTIONS = {
"accidentarse": [
{ORTH: "accidentar", LEMMA: "accidentar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"aceptarlo": [
{ORTH: "aceptar", LEMMA: "aceptar", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"acompañarla": [
{ORTH: "acompañar", LEMMA: "acompañar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"advertirle": [
{ORTH: "advertir", LEMMA: "advertir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"al": [
{ORTH: "a", LEMMA: "a", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"anunciarnos": [
{ORTH: "anunciar", LEMMA: "anunciar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"asegurándole": [
{ORTH: "asegurando", LEMMA: "asegurar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"considerarle": [
{ORTH: "considerar", LEMMA: "considerar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirle": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirles": [
{ORTH: "decir", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"decirte": [
{ORTH: "Decir", LEMMA: "decir", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarla": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejarnos": [
{ORTH: "dejar", LEMMA: "dejar", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"dejándole": [
{ORTH: "dejando", LEMMA: "dejar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"del": [
{ORTH: "de", LEMMA: "de", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"demostrarles": [
{ORTH: "demostrar", LEMMA: "demostrar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndole": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"diciéndoles": [
{ORTH: "diciendo", LEMMA: "decir", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"diferenciarse": [
{ORTH: "diferenciar", LEMMA: "diferenciar", POS: AUX},
{ORTH: "se", LEMMA: "él", POS: PRON}
],
"divirtiéndome": [
{ORTH: "divirtiendo", LEMMA: "divertir", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"ensanchándose": [
{ORTH: "ensanchando", LEMMA: "ensanchar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"explicarles": [
{ORTH: "explicar", LEMMA: "explicar", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberla": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlas": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlo": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberlos": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberme": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"haberse": [
{ORTH: "haber", LEMMA: "haber", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerle": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"hacerles": [
{ORTH: "hacer", LEMMA: "hacer", POS: AUX},
{ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON}
],
"hallarse": [
{ORTH: "hallar", LEMMA: "hallar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"imaginaros": [
{ORTH: "imaginar", LEMMA: "imaginar", POS: AUX},
{ORTH: "os", LEMMA: PRON_LEMMA, POS: PRON}
],
"insinuarle": [
{ORTH: "insinuar", LEMMA: "insinuar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"justificarla": [
{ORTH: "justificar", LEMMA: "justificar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlas": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerlos": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"mantenerme": [
{ORTH: "mantener", LEMMA: "mantener", POS: AUX},
{ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON}
],
"pasarte": [
{ORTH: "pasar", LEMMA: "pasar", POS: AUX},
{ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON}
],
"pedirle": [
{ORTH: "pedir", LEMMA: "pedir", POS: AUX},
{ORTH: "le", LEMMA: "él", POS: PRON}
],
"pel": [
{ORTH: "per", LEMMA: "per", POS: ADP},
{ORTH: "el", LEMMA: "el", POS: DET}
],
"pidiéndonos": [
{ORTH: "pidiendo", LEMMA: "pedir", POS: AUX},
{ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON}
],
"poderle": [
{ORTH: "poder", LEMMA: "poder", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntarse": [
{ORTH: "preguntar", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"preguntándose": [
{ORTH: "preguntando", LEMMA: "preguntar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"presentarla": [
{ORTH: "presentar", LEMMA: "presentar", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndolo": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"pudiéndose": [
{ORTH: "pudiendo", LEMMA: "poder", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"quererle": [
{ORTH: "querer", LEMMA: "querer", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"rasgarse": [
{ORTH: "Rasgar", LEMMA: "rasgar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"repetirlo": [
{ORTH: "repetir", LEMMA: "repetir", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"robarle": [
{ORTH: "robar", LEMMA: "robar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"seguirlos": [
{ORTH: "seguir", LEMMA: "seguir", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"serle": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"serlo": [
{ORTH: "ser", LEMMA: "ser", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
],
"señalándole": [
{ORTH: "señalando", LEMMA: "señalar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"suplicarle": [
{ORTH: "suplicar", LEMMA: "suplicar", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"tenerlos": [
{ORTH: "tener", LEMMA: "tener", POS: AUX},
{ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON}
],
"vengarse": [
{ORTH: "vengar", LEMMA: "vengar", POS: AUX},
{ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON}
],
"verla": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON}
],
"verle": [
{ORTH: "ver", LEMMA: "ver", POS: AUX},
{ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON}
],
"volverlo": [
{ORTH: "volver", LEMMA: "volver", POS: AUX},
{ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON}
]
}
ORTH_ONLY = [
]
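Each entry above maps a surface form to the tokens it should be split into, and the exception only works if the ORTH values rebuild the original string. A small sanity-check sketch, assuming the module path spacy/es/tokenizer_exceptions.py implied by the imports in this diff:

# Hedged sketch: inspect one exception and report entries whose ORTH
# pieces do not concatenate back to the key (e.g. the "pel" entry).
from spacy.symbols import ORTH
from spacy.es.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

print(TOKENIZER_EXCEPTIONS["del"])   # two token specs: "de" and "el"

for key, substrings in TOKENIZER_EXCEPTIONS.items():
    rebuilt = "".join(piece[ORTH] for piece in substrings)
    if rebuilt != key:
        print("ORTH pieces do not rebuild the key:", key, "->", rebuilt)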

View File

@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class French(Language):
@ -34,8 +17,4 @@ class French(Language):
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS

View File

@ -1,109 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import strings_to_exc
from .stop_words import STOP_WORDS
TAG_MAP = {
}
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
STOP_WORDS = set("""
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
allô alors anterieur anterieure anterieures apres après as assez attendu au
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient
avais avait avant avec avoir avons ayant
bah bas basee bat beau beaucoup bien bigre boum bravo brrr
ça car ce ceci cela celle celle-ci celle- celles celles-ci celles- celui
celui-ci celui- cent cependant certain certaine certaines certains certes ces
cet cette ceux ceux-ci ceux- chacun chacune chaque cher chers chez chiche
chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac
clic combien comme comment comparable comparables compris concernant contre
couic crac
da dans de debout dedans dehors deja delà depuis dernier derniere derriere
derrière des desormais desquelles desquels dessous dessus deux deuxième
deuxièmement devant devers devra different differentes differents différent
différente différentes différents dire directe directement dit dite dits divers
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
douze douzième dring du duquel durant dès désormais
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
enfin entre envers environ es ès est et etaient étaient etais étais etait était
etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso
exterieur
fais faisaient faisant fait façon feront fi flac floc font
gens
ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum
hurrah hélas i il ils importe
je jusqu jusque juste
la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps
lors lorsque lui lui-meme lui-même lès
ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien
mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins
mon moyennant multiple multiples même mêmes
na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf
neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau
nul néanmoins nôtre nôtres
o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre
ouvert ouverte ouverts
paf pan par parce parfois parle parlent parler parmi parseme partant
particulier particulière particulièrement pas passé pendant pense permet
personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus
plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi
pourrais pourrait pouvait prealable precisement premier première premièrement
pres probable probante procedant proche près psitt pu puis puisque pur pure
qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
quelques quels qui quiconque quinze quoi quoique
rare rarement rares relative relativement remarquable rend rendre restant reste
restent restrictif retour revoici revoilà rien
sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient
semble semblent sent sept septième sera seraient serait seront ses seul seule
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
soixante son sont sous souvent specifique specifiques speculatif stop
strictement subtiles suffisant suffisante suffit suis suit suivant suivante
suivantes suivants suivre superpose sur surtout
ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente
tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous
tout toute toutefois toutes treize trente tres trois troisième troisièmement
trop très tsoin tsouin tu
un une unes uniformement unique uniques uns
va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos
votre vous vous-mêmes vu vôtre vôtres
zut
""".split())
TOKENIZER_EXCEPTIONS = {
}
ORTH_ONLY = {
}
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

88 spacy/fr/stop_words.py Normal file
View File

@ -0,0 +1,88 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons
allô alors anterieur anterieure anterieures apres après as assez attendu au
aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront
aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient
avais avait avant avec avoir avons ayant
bah bas basee bat beau beaucoup bien bigre boum bravo brrr
ça car ce ceci cela celle celle-ci celle- celles celles-ci celles- celui
celui-ci celui- cent cependant certain certaine certaines certains certes ces
cet cette ceux ceux-ci ceux- chacun chacune chaque cher chers chez chiche
chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac
clic combien comme comment comparable comparables compris concernant contre
couic crac
da dans de debout dedans dehors deja delà depuis dernier derniere derriere
derrière des desormais desquelles desquels dessous dessus deux deuxième
deuxièmement devant devers devra different differentes differents différent
différente différentes différents dire directe directement dit dite dits divers
diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont
douze douzième dring du duquel durant dès désormais
effet egale egalement egales eh elle elle-même elles elles-mêmes en encore
enfin entre envers environ es ès est et etaient étaient etais étais etait était
etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso
exterieur
fais faisaient faisant fait façon feront fi flac floc font
gens
ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum
hurrah hélas i il ils importe
je jusqu jusque juste
la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps
lors lorsque lui lui-meme lui-même lès
ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien
mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins
mon moyennant multiple multiples même mêmes
na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf
neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau
nul néanmoins nôtre nôtres
o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre
ouvert ouverte ouverts
paf pan par parce parfois parle parlent parler parmi parseme partant
particulier particulière particulièrement pas passé pendant pense permet
personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus
plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi
pourrais pourrait pouvait prealable precisement premier première premièrement
pres probable probante procedant proche près psitt pu puis puisque pur pure
qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt
quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque
quelques quels qui quiconque quinze quoi quoique
rare rarement rares relative relativement remarquable rend rendre restant reste
restent restrictif retour revoici revoilà rien
sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient
semble semblent sent sept septième sera seraient serait seront ses seul seule
seulement si sien sienne siennes siens sinon six sixième soi soi-même soit
soixante son sont sous souvent specifique specifiques speculatif stop
strictement subtiles suffisant suffisante suffit suis suit suivant suivante
suivantes suivants suivre superpose sur surtout
ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente
tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous
tout toute toutefois toutes treize trente tres trois troisième troisièmement
trop très tsoin tsouin tu
un une unes uniformement unique uniques uns
va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos
votre vous vous-mêmes vu vôtre vôtres
zut
""".split())

View File

@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class Italian(Language):
@ -34,8 +17,4 @@ class Italian(Language):
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS

View File

@ -1,106 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TAG_MAP = {
}
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
STOP_WORDS = set("""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri
altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai
attesa attraverso avanti avemmo avendo avente aver avere averlo avesse
avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate
avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste
avresti avrete avrà avrò avuta avute avuti avuto
basta bene benissimo brava bravo
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
cogli coi col colei coll coloro colui come cominci comunque con concernente
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
dei del dell della delle dello dentro detto deve di dice dietro dire
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
dunque durante
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
facevo fai fanno farai faranno fare farebbe farebbero farei faremmo faremo
fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
frattempo fu fui fummo fuori furono futuro generale
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
grande grazie gruppo
ha haha hai hanno ho
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
la lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
ma macche magari maggior mai male malgrado malissimo mancanza marche me
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
nostri nostro novanta nove nulla nuovo
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
paese parecchi parecchie parecchio parte partendo peccato peggio per perche
perché percio perciò perfino pero persino persone però piedi pieno piglia piu
piuttosto più po pochissimo poco poi poiche possa possedere posteriore posto
potrebbe preferibilmente presa press prima primo principalmente probabilmente
proprio puo può pure purtroppo
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
questa queste questi questo qui quindi
realmente recente recentemente registrazione relativo riecco salvo
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
stanno starai staranno starebbe starebbero starei staremmo staremo stareste
staresti starete starà starò stata state stati stato stava stavamo stavano
stavate stavi stavo stemmo stessa stesse stessero stessi stessimo stesso
steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
sullo suo suoi
tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
uguali ulteriore ultimo un una uno uomo
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
vostra vostre vostri vostro
""".split())
TOKENIZER_EXCEPTIONS = {
}
ORTH_ONLY = {
}
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

85 spacy/it/stop_words.py Normal file
View File

@ -0,0 +1,85 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl
agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri
altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai
attesa attraverso avanti avemmo avendo avente aver avere averlo avesse
avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate
avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste
avresti avrete avrà avrò avuta avute avuti avuto
basta bene benissimo brava bravo
casa caso cento certa certe certi certo che chi chicchessia chiunque ci
ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto
cogli coi col colei coll coloro colui come cominci comunque con concernente
conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui
da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli
dei del dell della delle dello dentro detto deve di dice dietro dire
dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due
dunque durante
ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era
erano eravamo eravate eri ero esempio esse essendo esser essere essi ex
fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero
facessi facessimo faceste facesti faceva facevamo facevano facevate facevi
facevo fai fanno farai faranno fare farebbe farebbero farei faremmo faremo
fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente
finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra
frattempo fu fui fummo fuori furono futuro generale
gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo
grande grazie gruppo
ha haha hai hanno ho
ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io
la lasciato lato lavoro le lei li lo lontano loro lui lungo luogo
ma macche magari maggior mai male malgrado malissimo mancanza marche me
medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi
milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto
nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun
nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre
nostri nostro novanta nove nulla nuovo
od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto
paese parecchi parecchie parecchio parte partendo peccato peggio per perche
perché percio perciò perfino pero persino persone però piedi pieno piglia piu
piuttosto più po pochissimo poco poi poiche possa possedere posteriore posto
potrebbe preferibilmente presa press prima primo principalmente probabilmente
proprio puo può pure purtroppo
qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante
quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest
questa queste questi questo qui quindi
realmente recente recentemente registrazione relativo riecco salvo
sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste
saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei
sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate
siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando
stanno starai staranno starebbe starebbero starei staremmo staremo stareste
staresti starete starà starò stata state stati stato stava stavamo stavano
stavate stavi stavo stemmo stessa stesse stessero stessi stessimo stesso
steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
sullo suo suoi
tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto
uguali ulteriore ultimo un una uno uomo
va vale vari varia varie vario verso vi via vicino visto vita voi volta volte
vostra vostre vostri vostro
""".split())

View File

@ -21,6 +21,7 @@ from .matcher import Matcher
from . import attrs
from . import orth
from . import util
from . import language_data
from .lemmatizer import Lemmatizer
from .train import Trainer
@ -38,7 +39,7 @@ class BaseDefaults(object):
if nlp is None or nlp.path is None:
return Lemmatizer({}, {}, {})
else:
return Lemmatizer.load(nlp.path)
return Lemmatizer.load(nlp.path, rules=self.lemma_rules)
@classmethod
def create_vocab(cls, nlp=None):
@ -53,7 +54,7 @@ class BaseDefaults(object):
else:
return Vocab.load(nlp.path, lex_attr_getters=cls.lex_attr_getters,
tag_map=cls.tag_map, lemmatizer=lemmatizer)
@classmethod
def add_vectors(cls, nlp=None):
if nlp is None or nlp.path is None:
@ -140,25 +141,27 @@ class BaseDefaults(object):
if nlp.entity:
pipeline.append(nlp.entity)
return pipeline
prefixes = tuple()
suffixes = tuple()
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
infixes = tuple()
tag_map = {}
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
infixes = tuple(language_data.TOKENIZER_INFIXES)
tag_map = dict(language_data.TAG_MAP)
tokenizer_exceptions = {}
parser_features = get_templates('parser')
entity_features = get_templates('ner')
tagger_features = Tagger.feature_templates # TODO -- fix this
stop_words = set()
lemma_rules = {}
lex_attr_getters = {
attrs.LOWER: lambda string: string.lower(),
attrs.NORM: lambda string: string,
@ -257,7 +260,7 @@ class Language(object):
path = util.match_best_version(self.lang, '', util.get_data_path())
self.path = path
self.vocab = self.Defaults.create_vocab(self) \
if 'vocab' not in overrides \
else overrides['vocab']
@ -299,7 +302,7 @@ class Language(object):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
@ -327,9 +330,9 @@ class Language(object):
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
'''Process texts as a stream, and yield Doc objects in order.
Supports GIL-free multi-threading.
Arguments:
texts (iterator)
tag (bool)
@ -352,7 +355,7 @@ class Language(object):
path = self.path
elif isinstance(path, basestring):
path = pathlib.Path(path)
if self.tagger:
self.tagger.model.end_training()
self.tagger.model.dump(str(path / 'pos' / 'model'))
@ -362,7 +365,7 @@ class Language(object):
if self.entity:
self.entity.model.end_training()
self.entity.model.dump(str(path / 'ner' / 'model'))
strings_loc = path / 'vocab' / 'strings.json'
with strings_loc.open('w', encoding='utf8') as file_:
self.vocab.strings.dump(file_)
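With the shared punctuation rules and tag map now pulled into BaseDefaults from spacy.language_data, a language subclass only has to override what actually differs. A minimal sketch of that pattern, mirroring the Spanish, French and Italian classes in this commit; the 'xx' code and NewLanguage name are placeholders, not part of the commit:

# Hedged sketch of the new per-language layout; 'xx' is a placeholder code.
from spacy.language import Language
from spacy.attrs import LANG

class NewLanguage(Language):
    lang = 'xx'

    class Defaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'
        # Prefixes, suffixes, infixes and the tag map now come from
        # BaseDefaults via spacy.language_data, so only language-specific
        # data needs to be set here:
        tokenizer_exceptions = {}      # e.g. dict(TOKENIZER_EXCEPTIONS)
        stop_words = set()             # e.g. set(STOP_WORDS)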

View File

@ -1,3 +1,5 @@
from .emoticons import *
from .punctuation import *
from .tag_map import *
from .entity_rules import *
from .util import *

View File

@ -0,0 +1,206 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from .util import ENT_ID
ENTITY_RULES = [
{
ENT_ID: "Reddit",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "reddit"}]
]
},
{
ENT_ID: "Linux",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "linux"}]
]
},
{
ENT_ID: "Haskell",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "haskell"}],
]
},
{
ENT_ID: "HaskellCurry",
"attrs": {ENT_TYPE: "PERSON"},
"patterns": [
[{LOWER: "haskell"}, {LOWER: "curry"}]
]
},
{
ENT_ID: "Javascript",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "javascript"}],
]
},
{
ENT_ID: "CSS",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "css"}],
[{LOWER: "css3"}],
]
},
{
ENT_ID: "HTML",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "html"}],
[{LOWER: "html5"}],
]
},
{
ENT_ID: "Python",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Python"}]
]
},
{
ENT_ID: "Ruby",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Ruby"}]
]
},
{
ENT_ID: "spaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "spacy"}]
]
},
{
ENT_ID: "displaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "displacy"}]
]
},
{
ENT_ID: "Digg",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "digg"}]
]
},
{
ENT_ID: "FoxNews",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "foxnews"}],
[{LOWER: "fox"}, {LOWER: "news"}]
]
},
{
ENT_ID: "Google",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "google"}]
]
},
{
ENT_ID: "Mac",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "mac"}]
]
},
{
ENT_ID: "Wikipedia",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "wikipedia"}]
]
},
{
ENT_ID: "Windows",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Windows"}]
]
},
{
ENT_ID: "Dell",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "dell"}]
]
},
{
ENT_ID: "Facebook",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "facebook"}]
]
},
{
ENT_ID: "Blizzard",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Blizzard"}]
]
},
{
ENT_ID: "Ubuntu",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Ubuntu"}]
]
},
{
ENT_ID: "YouTube",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "youtube"}]
]
}
]
FALSE_POSITIVES = [
[{ORTH: "Shit"}],
[{ORTH: "Weed"}],
[{ORTH: "Cool"}],
[{ORTH: "Btw"}],
[{ORTH: "Bah"}],
[{ORTH: "Bullshit"}],
[{ORTH: "Lol"}],
[{ORTH: "Yo"}, {LOWER: "dawg"}],
[{ORTH: "Yay"}],
[{ORTH: "Ahh"}],
[{ORTH: "Yea"}],
[{ORTH: "Bah"}]
]
__all__ = ["ENTITY_RULES", "FALSE_POSITIVES"]

View File

@ -130,4 +130,4 @@ TOKENIZER_INFIXES = r'''
'''.strip().split('\n')
__all__ = [ "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES" ]
__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]

View File

@ -0,0 +1,24 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB}
}
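The tag map keys coarse tag names to the attributes every token with that tag receives, here just the universal POS. A tiny lookup sketch, assuming the file lands at spacy/language_data/tag_map.py as the package __init__ suggests:

# Hedged sketch: TAG_MAP maps a tag string to the attrs assigned to tokens.
from spacy.symbols import POS, VERB
from spacy.language_data.tag_map import TAG_MAP

assert TAG_MAP["VERB"] == {POS: VERB}
# Morphology._normalize_props and intify_attrs (see the morphology.pyx
# diff below) later turn these entries into integer attribute IDs.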

View File

@ -5,6 +5,7 @@ from ..symbols import *
PRON_LEMMA = "-PRON-"
ENT_ID = "ent_id"
def update_exc(exc, additions):
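PRON_LEMMA and ENT_ID are shared string constants, and update_exc is the helper the per-language modules in this commit call to merge extra exceptions in. A sketch of the calling convention; the bodies of update_exc and strings_to_exc are not shown in the diff, so the merge semantics described in the comments are an assumption, and the ORTH_ONLY strings are hypothetical:

# Hedged sketch of how the helpers are used by the language modules above.
from spacy.language_data import update_exc, strings_to_exc, EMOTICONS

TOKENIZER_EXCEPTIONS = {}
ORTH_ONLY = ["p.ej.", "EE.UU."]   # hypothetical orth-only exceptions
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
# Assumed outcome: each string maps to a single-token analysis, e.g.
# TOKENIZER_EXCEPTIONS["p.ej."] == [{ORTH: "p.ej."}]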

View File

@ -4,12 +4,12 @@ import pathlib
import ujson as json
from .symbols import NOUN, VERB, ADJ, PUNCT
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object):
@classmethod
def load(cls, path):
def load(cls, path, rules=None):
index = {}
exc = {}
for pos in ['adj', 'noun', 'verb']:
@ -25,8 +25,11 @@ class Lemmatizer(object):
exc[pos] = read_exc(file_)
else:
exc[pos] = {}
with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
rules = json.load(file_)
if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
rules = json.load(file_)
elif rules is None:
rules = {}
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):
@ -55,7 +58,7 @@ class Lemmatizer(object):
'''Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.'''
morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
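The change above lets callers hand lemmatization rules to Lemmatizer.load directly instead of requiring a vocab/lemma_rules.json on disk, which is what the new lemma_rules hook on BaseDefaults feeds into. A minimal sketch that mirrors the test fixtures further down in this diff:

# Hedged sketch: build a Lemmatizer with in-memory rules, as the new
# test fixtures below do, instead of reading vocab/lemma_rules.json.
from spacy.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.symbols import POS, VERB, VerbForm_inf

index = {'verb': {}}
exceptions = {'verb': {}}
rules = {"verb": [["ed", "e"]]}          # strip "ed", append "e"
lemmatizer = Lemmatizer(index, exceptions, rules)

tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)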

View File

@ -4,11 +4,12 @@ from libc.stdint cimport uint64_t
from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .typedefs cimport attr_t, flags_t
from .parts_of_speech cimport univ_pos_t
from . cimport symbols
cdef struct RichTagC:
uint64_t morph
int id
@ -37,7 +38,7 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
cpdef enum univ_morph_t:

View File

@ -1,4 +1,7 @@
from os import path
from libc.string cimport memset
from .lemmatizer import Lemmatizer
try:
@ -6,10 +9,11 @@ try:
except ImportError:
import json
from .parts_of_speech import IDS as POS_IDS
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
from .attrs cimport POS, IS_SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .attrs import intify_attrs
def _normalize_props(props):
@ -29,6 +33,7 @@ def _normalize_props(props):
return out
cdef class Morphology:
def __init__(self, StringStore string_store, tag_map, lemmatizer):
self.mem = Pool()
@ -40,12 +45,13 @@ cdef class Morphology:
self.reverse_index = {}
self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
props = _normalize_props(props)
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = props[POS]
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
self._cache = PreshMapArray(self.n_tags)
@ -82,38 +88,51 @@ cdef class Morphology:
token.tag = analysis.tag.name
token.morph = analysis.tag.morph
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
pass
cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
cdef flags_t one = 1
if value:
flags[0] |= one << flag_id
else:
flags[0] &= ~(one << flag_id)
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
'''Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
Arguments:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
'''
tag = self.strings[tag_str]
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
rich_tag = self.rich_tags[tag_id]
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
if cached is NULL:
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
elif force:
memset(cached, 0, sizeof(cached[0]))
else:
msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
"to overwrite.")
msg = msg % (tag_str, orth_str)
raise ValueError(msg)
cached.tag = rich_tag
for name_id, value_id in attrs.items():
self.assign_feature(&cached.tag.morph, name_id, value_id)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth,
self.tag_map.get(tag_str, {}))
self._cache.set(tag_id, orth, <void*>cached)
def load_morph_exceptions(self, dict exc):
# Map (form, pos) to (lemma, rich tag)
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef attr_t tag_id
cdef int pos
cdef RichTagC rich_tag
for tag_str, entries in exc.items():
tag = self.strings[tag_str]
tag_id = self.reverse_index[tag]
rich_tag = self.rich_tags[tag_id]
for form_str, props in entries.items():
cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
cached.tag = rich_tag
orth = self.strings[form_str]
for name_str, value_str in props.items():
if name_str == 'L':
cached.lemma = self.strings[value_str]
else:
self.assign_feature(&cached.tag.morph, name_str, value_str)
if cached.lemma == 0:
cached.lemma = self.lemmatize(rich_tag.pos, orth,
self.tag_map.get(tag_str, {}))
self._cache.set(tag_id, orth, <void*>cached)
for form_str, attrs in entries.items():
self.add_special_case(tag_str, form_str, attrs)
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth]
@ -128,6 +147,7 @@ cdef class Morphology:
lemma = self.strings[lemma_string]
return lemma
IDS = {
"Animacy_anim": Animacy_anim,
"Animacy_inam": Animacy_inam,

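add_special_case and load_morph_exceptions key exceptions on (tag, orth) pairs, where each entry carries either an 'L' lemma override or attribute values that intify_attrs can resolve. A hedged sketch of the expected exception shape; the "RB"/"n't" entry is illustrative and not taken from this commit, and the Vocab construction follows the test fixtures shown below:

# Hedged sketch: the exception dict maps tag -> {word form -> attrs}.
# 'L' is the deprecated shorthand for LEMMA handled by intify_attrs
# with _do_deprecated=True, as in the code above.
from spacy.vocab import Vocab
from spacy.symbols import POS, ADV

vocab = Vocab(tag_map={"RB": {POS: ADV}})   # assumed minimal setup

morph_exceptions = {
    "RB": {
        "n't": {"L": "not"},   # illustrative entry, not from the diff
    },
}

# load_morph_exceptions routes every (tag, form) pair through
# add_special_case, exactly as the rewritten loop above does.
vocab.morphology.load_morph_exceptions(morph_exceptions)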
View File

@ -4,39 +4,16 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class Dutch(Language):
lang = 'nl'
class Defaults(Language.Defaults):
tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS

View File

@ -1,83 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
# TODO insert TAG_MAP for Dutch
TAG_MAP = {
"ADV": {POS: ADV},
"NOUN": {POS: NOUN},
"ADP": {POS: ADP},
"PRON": {POS: PRON},
"SCONJ": {POS: SCONJ},
"PROPN": {POS: PROPN},
"DET": {POS: DET},
"SYM": {POS: SYM},
"INTJ": {POS: INTJ},
"PUNCT": {POS: PUNCT},
"NUM": {POS: NUM},
"AUX": {POS: AUX},
"X": {POS: X},
"CONJ": {POS: CONJ},
"ADJ": {POS: ADJ},
"VERB": {POS: VERB}
}
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
# Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt
STOP_WORDS = set("""
aan af al alles als altijd andere
ben bij
daar dan dat de der deze die dit doch doen door dus
een eens en er
ge geen geweest
haar had heb hebben heeft hem het hier hij hoe hun
iemand iets ik in is
ja je
kan kon kunnen
maar me meer men met mij mijn moet
na naar niet niets nog nu
of om omdat ons ook op over
reeds
te tegen toch toen tot
u uit uw
van veel voor
want waren was wat we wel werd wezen wie wij wil worden
zal ze zei zelf zich zij zijn zo zonder zou
""".split())
# TODO Make tokenizer exceptions for Dutch
TOKENIZER_EXCEPTIONS = {
}
ORTH_ONLY = {
}
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

43 spacy/nl/stop_words.py Normal file
View File

@ -0,0 +1,43 @@
# encoding: utf8
from __future__ import unicode_literals
# Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt
STOP_WORDS = set("""
aan af al alles als altijd andere
ben bij
daar dan dat de der deze die dit doch doen door dus
een eens en er
ge geen geweest
haar had heb hebben heeft hem het hier hij hoe hun
iemand iets ik in is
ja je
kan kon kunnen
maar me meer men met mij mijn moet
na naar niet niets nog nu
of om omdat ons ook op over
reeds
te tegen toch toen tot
u uit uw
van veel voor
want waren was wat we wel werd wezen wie wij wil worden
zal ze zei zelf zich zij zijn zo zonder zou
""".split())

View File

@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
from . import language_data
from ..attrs import LANG
from ..language_data import update_exc
from ..language_data import strings_to_exc
from ..language_data import EMOTICONS
from .language_data import ORTH_ONLY
TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES)
TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES)
TAG_MAP = dict(language_data.TAG_MAP)
STOP_WORDS = set(language_data.STOP_WORDS)
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS))
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
from .language_data import *
class Portuguese(Language):
@ -34,8 +17,4 @@ class Portuguese(Language):
lex_attr_getters[LANG] = lambda text: 'pt'
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
suffixes = TOKENIZER_SUFFIXES
infixes = TOKENIZER_INFIXES
tag_map = TAG_MAP
stop_words = STOP_WORDS

View File

@ -1,87 +1,14 @@
# encoding: utf8
from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
from ..language_data import TOKENIZER_PREFIXES
from ..language_data import TOKENIZER_SUFFIXES
from ..language_data import TOKENIZER_INFIXES
from .. import language_data as base
from ..language_data import update_exc, strings_to_exc
from .stop_words import STOP_WORDS
TAG_MAP = {
}
TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
STOP_WORDS = set(STOP_WORDS)
STOP_WORDS = set("""
à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano
anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui
aquilo area área as assim através atrás até
baixo bastante bem bom breve
cada caminho catorze cedo cento certamente certeza cima cinco coisa com como
comprido conhecido conselho contra corrente custa
da daquela daquele dar das de debaixo demais dentro depois desde desligado
dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete
dezoito dia diante direita diz dizem dizer do dois dos doze duas dão dúvida
é ela elas ele eles em embora enquanto entre então era és essa essas esse esses
esta estado estar estará estas estava este estes esteve estive estivemos
estiveram estiveste estivestes estou está estás estão eu exemplo
falta fará favor faz fazeis fazem fazemos fazer fazes fazia faço fez fim final
foi fomos for fora foram forma foste fostes fui
geral grande grandes grupo
hoje horas
iniciar inicio ir irá isso ista iste isto
lado ligado local logo longe lugar
maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus
mil minha minhas momento muito muitos máximo mês
na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome
nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós
número
obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras
outro outros
para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem
poder poderá podia ponto pontos por porque porquê posição possivelmente posso
possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam
pôde põe põem
qual qualquer quando quanto quarta quarto quatro que quem quer quero questão
quieto quinta quinto quinze quê relação
sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta
sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo
tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar
tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos
tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu
tua tuas tudo tão têm
último um uma umas uns usa usar
vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo
vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
zero
""".split())
TOKENIZER_EXCEPTIONS = {
}
ORTH_ONLY = {
}
__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
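The two helpers imported above live in the shared language_data package; their behaviour is roughly the following (paraphrased here for context, not part of this diff):

# rough paraphrase of the shared helpers (for context only)
from spacy.symbols import ORTH

def strings_to_exc(orths):
    # turn each plain string (e.g. an emoticon) into a one-token special case
    return {orth: [{ORTH: orth}] for orth in orths}

def update_exc(exc, additions):
    # merge extra special cases into an existing exception dict
    exc.update(additions)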

66 spacy/pt/stop_words.py Normal file
View File

@ -0,0 +1,66 @@
# encoding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano
anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui
aquilo area área as assim através atrás até
baixo bastante bem bom breve
cada caminho catorze cedo cento certamente certeza cima cinco coisa com como
comprido conhecido conselho contra corrente custa
da daquela daquele dar das de debaixo demais dentro depois desde desligado
dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete
dezoito dia diante direita diz dizem dizer do dois dos doze duas dão dúvida
é ela elas ele eles em embora enquanto entre então era és essa essas esse esses
esta estado estar estará estas estava este estes esteve estive estivemos
estiveram estiveste estivestes estou está estás estão eu exemplo
falta fará favor faz fazeis fazem fazemos fazer fazes fazia faço fez fim final
foi fomos for fora foram forma foste fostes fui
geral grande grandes grupo
hoje horas
iniciar inicio ir irá isso ista iste isto
lado ligado local logo longe lugar
maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus
mil minha minhas momento muito muitos máximo mês
na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome
nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós
número
obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras
outro outros
para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem
poder poderá podia ponto pontos por porque porquê posição possivelmente posso
possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam
pôde põe põem
qual qualquer quando quanto quarta quarto quatro que quem quer quero questão
quieto quinta quinto quinze quê relação
sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta
sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo
tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar
tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos
tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu
tua tuas tudo tão têm
último um uma umas uns usa usar
vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo
vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
zero
""".split())

View File

@ -1,14 +1,42 @@
from __future__ import unicode_literals
import pytest
import spacy
from ...symbols import POS, VERB, VerbForm_inf
from ...tokens import Doc
from ...vocab import Vocab
from ...lemmatizer import Lemmatizer
@pytest.mark.models
def test_not_lemmatize_base_forms():
nlp = spacy.load('en', parser=False)
doc = nlp(u"Don't feed the dog")
@pytest.fixture
def index():
return {'verb': {}}
@pytest.fixture
def exceptions():
return {'verb': {}}
@pytest.fixture
def rules():
return {"verb": [["ed", "e"]]}
@pytest.fixture
def lemmatizer(index, exceptions, rules):
return Lemmatizer(index, exceptions, rules)
@pytest.fixture
def tag_map():
return {'VB': {POS: VERB, 'morph': VerbForm_inf}}
@pytest.fixture
def vocab(lemmatizer, tag_map):
return Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
def test_not_lemmatize_base_forms(vocab):
doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
feed = doc[2]
feed.tag_ = u'VB'
assert feed.text == u'feed'
assert feed.lemma_ == u'feed'
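A side note on why the fixtures are shaped this way: the single rule ["ed", "e"] would mangle base forms if applied blindly, which is exactly what the VB / VerbForm_inf tag-map entry guards against. A hedged sketch of the rule in isolation, using the same Lemmatizer constructor as the fixtures:

# sketch: the suffix rule on its own, without the base-form shortcut
from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, {'verb': [['ed', 'e']]})
print(lemmatizer('feed', 'verb'))   # expected to come out as something like set(['fee'])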

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals
from ...tokens import Doc
from ...vocab import Vocab
from ...attrs import POS
def test_issue600():

View File

@ -8,9 +8,9 @@ import cloudpickle
import tempfile
from ... import util
from ...en.language_data import TOKENIZER_PREFIXES as EN_TOKENIZER_PREFIXES
from ...language_data import TOKENIZER_PREFIXES
en_search_prefixes = util.compile_prefix_regex(EN_TOKENIZER_PREFIXES).search
en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
# @pytest.mark.xfail
# def test_pickle(en_tokenizer):
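For context, what the changed import exercises: the prefix regex is now compiled from the shared, language-agnostic rules rather than the English-specific ones. A small sketch (the sample string is arbitrary):

from spacy import util
from spacy.language_data import TOKENIZER_PREFIXES

search = util.compile_prefix_regex(TOKENIZER_PREFIXES).search
print(search(u'"Hello').group())    # expected to match the leading quote prefix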

View File

@ -83,6 +83,8 @@ cdef class Vocab:
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = json.load(file_)
elif tag_map is True:
tag_map = None
if lex_attr_getters is not None \
and oov_prob is True \
and (path / 'vocab' / 'oov_prob').exists():
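For readability, the branch added above can be restated in isolation; the helper name below is illustrative and does not exist in the codebase. tag_map=True means "load from disk if available", and the new elif makes the missing-file case fall back to None so the Vocab uses its defaults instead of receiving the literal True.

import json

def resolve_tag_map(tag_map, path):
    # path is a pathlib.Path pointing at a model directory
    if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
        with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
            tag_map = json.load(file_)
    elif tag_map is True:
        # no tag_map.json on disk: fall back to None rather than passing True through
        tag_map = None
    return tag_map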