From a4eb5c2bff3d92a157707a52c6a45748901a1905 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 13:28:20 +0100 Subject: [PATCH 01/31] Check POS key in lemmatizer, to update it for new data format --- spacy/lemmatizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 8aee14717..a79ecb009 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -4,7 +4,7 @@ import pathlib import ujson as json -from .symbols import NOUN, VERB, ADJ, PUNCT +from .symbols import POS, NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @@ -55,7 +55,7 @@ class Lemmatizer(object): '''Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely.''' morphology = {} if morphology is None else morphology - others = [key for key in morphology if key not in ('number', 'pos', 'verbform')] + others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others: return True elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: From 0595cc06357a572ef604d6c3e0b560974720524c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 13:28:51 +0100 Subject: [PATCH 02/31] Change test595 to mock data, instead of requiring model. --- spacy/tests/regression/test_issue595.py | 39 +++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py index 205d62bb1..97270c54f 100644 --- a/spacy/tests/regression/test_issue595.py +++ b/spacy/tests/regression/test_issue595.py @@ -1,12 +1,41 @@ +from __future__ import unicode_literals import pytest -import spacy +from ...symbols import POS, VERB, VerbForm_inf +from ...tokens import Doc +from ...vocab import Vocab +from ...lemmatizer import Lemmatizer -@pytest.mark.models -def test_not_lemmatize_base_forms(): - nlp = spacy.load('en', parser=False) - doc = nlp(u"Don't feed the dog") +@pytest.fixture +def index(): + return {'verb': {}} + +@pytest.fixture +def exceptions(): + return {'verb': {}} + +@pytest.fixture +def rules(): + return {"verb": [["ed", "e"]]} + +@pytest.fixture +def lemmatizer(index, exceptions, rules): + return Lemmatizer(index, exceptions, rules) + + +@pytest.fixture +def tag_map(): + return {'VB': {POS: VERB, 'morph': VerbForm_inf}} + + +@pytest.fixture +def vocab(lemmatizer, tag_map): + return Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + + +def test_not_lemmatize_base_forms(vocab, lemmatizer): + doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"]) feed = doc[2] feed.tag_ = u'VB' assert feed.text == u'feed' From 28326649f31ce2627f44db438433c4d5700ed005 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 13:02:26 +0100 Subject: [PATCH 03/31] Fix typo --- spacy/en/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index 8933e1f47..3daa00767 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -2154,7 +2154,7 @@ TOKENIZER_EXCEPTIONS = { ORTH_ONLY = [ - "'", + "''", "\")", "a.", "a.m.", From 0fc4e45cb3ca79985e3ac026ad36f52ee344ae6b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 13:03:01 +0100 Subject: [PATCH 04/31] Fix tag map for German --- spacy/de/language_data.py | 114 +++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git 
a/spacy/de/language_data.py b/spacy/de/language_data.py index 0f5930587..b77ef6f37 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -9,63 +9,63 @@ from ..language_data import TOKENIZER_INFIXES TAG_MAP = { - "$(": {TAG: PUNCT, "PunctType": "brck"}, - "$,": {TAG: PUNCT, "PunctType": "comm"}, - "$.": {TAG: PUNCT, "PunctType": "peri"}, - "ADJA": {TAG: ADJ}, - "ADJD": {TAG: ADJ, "Variant": "short"}, - "ADV": {TAG: ADV}, - "APPO": {TAG: ADP, "AdpType": "post"}, - "APPR": {TAG: ADP, "AdpType": "prep"}, - "APPRART": {TAG: ADP, "AdpType": "prep", "PronType": "art"}, - "APZR": {TAG: ADP, "AdpType": "circ"}, - "ART": {TAG: DET, "PronType": "art"}, - "CARD": {TAG: NUM, "NumType": "card"}, - "FM": {TAG: X, "Foreign": "yes"}, - "ITJ": {TAG: INTJ}, - "KOKOM": {TAG: CONJ, "ConjType": "comp"}, - "KON": {TAG: CONJ}, - "KOUI": {TAG: SCONJ}, - "KOUS": {TAG: SCONJ}, - "NE": {TAG: PROPN}, - "NNE": {TAG: PROPN}, - "NN": {TAG: NOUN}, - "PAV": {TAG: ADV, "PronType": "dem"}, - "PROAV": {TAG: ADV, "PronType": "dem"}, - "PDAT": {TAG: DET, "PronType": "dem"}, - "PDS": {TAG: PRON, "PronType": "dem"}, - "PIAT": {TAG: DET, "PronType": "ind|neg|tot"}, - "PIDAT": {TAG: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, - "PIS": {TAG: PRON, "PronType": "ind|neg|tot"}, - "PPER": {TAG: PRON, "PronType": "prs"}, - "PPOSAT": {TAG: DET, "Poss": "yes", "PronType": "prs"}, - "PPOSS": {TAG: PRON, "Poss": "yes", "PronType": "prs"}, - "PRELAT": {TAG: DET, "PronType": "rel"}, - "PRELS": {TAG: PRON, "PronType": "rel"}, - "PRF": {TAG: PRON, "PronType": "prs", "Reflex": "yes"}, - "PTKA": {TAG: PART}, - "PTKANT": {TAG: PART, "PartType": "res"}, - "PTKNEG": {TAG: PART, "Negative": "yes"}, - "PTKVZ": {TAG: PART, "PartType": "vbp"}, - "PTKZU": {TAG: PART, "PartType": "inf"}, - "PWAT": {TAG: DET, "PronType": "int"}, - "PWAV": {TAG: ADV, "PronType": "int"}, - "PWS": {TAG: PRON, "PronType": "int"}, - "TRUNC": {TAG: X, "Hyph": "yes"}, - "VAFIN": {TAG: AUX, "Mood": "ind", "VerbForm": "fin"}, - "VAIMP": {TAG: AUX, "Mood": "imp", "VerbForm": "fin"}, - "VAINF": {TAG: AUX, "VerbForm": "inf"}, - "VAPP": {TAG: AUX, "Aspect": "perf", "VerbForm": "part"}, - "VMFIN": {TAG: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"}, - "VMINF": {TAG: VERB, "VerbForm": "inf", "VerbType": "mod"}, - "VMPP": {TAG: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"}, - "VVFIN": {TAG: VERB, "Mood": "ind", "VerbForm": "fin"}, - "VVIMP": {TAG: VERB, "Mood": "imp", "VerbForm": "fin"}, - "VVINF": {TAG: VERB, "VerbForm": "inf"}, - "VVIZU": {TAG: VERB, "VerbForm": "inf"}, - "VVPP": {TAG: VERB, "Aspect": "perf", "VerbForm": "part"}, - "XY": {TAG: X}, - "SP": {TAG: SPACE} + "$(": {POS: PUNCT, "PunctType": "brck"}, + "$,": {POS: PUNCT, "PunctType": "comm"}, + "$.": {POS: PUNCT, "PunctType": "peri"}, + "ADJA": {POS: ADJ}, + "ADJD": {POS: ADJ, "Variant": "short"}, + "ADV": {POS: ADV}, + "APPO": {POS: ADP, "AdpType": "post"}, + "APPR": {POS: ADP, "AdpType": "prep"}, + "APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"}, + "APZR": {POS: ADP, "AdpType": "circ"}, + "ART": {POS: DET, "PronType": "art"}, + "CARD": {POS: NUM, "NumType": "card"}, + "FM": {POS: X, "Foreign": "yes"}, + "ITJ": {POS: INTJ}, + "KOKOM": {POS: CONJ, "ConjType": "comp"}, + "KON": {POS: CONJ}, + "KOUI": {POS: SCONJ}, + "KOUS": {POS: SCONJ}, + "NE": {POS: PROPN}, + "NNE": {POS: PROPN}, + "NN": {POS: NOUN}, + "PAV": {POS: ADV, "PronType": "dem"}, + "PROAV": {POS: ADV, "PronType": "dem"}, + "PDAT": {POS: DET, "PronType": "dem"}, + "PDS": {POS: PRON, "PronType": 
"dem"}, + "PIAT": {POS: DET, "PronType": "ind|neg|tot"}, + "PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, + "PIS": {POS: PRON, "PronType": "ind|neg|tot"}, + "PPER": {POS: PRON, "PronType": "prs"}, + "PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"}, + "PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"}, + "PRELAT": {POS: DET, "PronType": "rel"}, + "PRELS": {POS: PRON, "PronType": "rel"}, + "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, + "PTKA": {POS: PART}, + "PTKANT": {POS: PART, "PartType": "res"}, + "PTKNEG": {POS: PART, "Negative": "yes"}, + "PTKVZ": {POS: PART, "PartType": "vbp"}, + "PTKZU": {POS: PART, "PartType": "inf"}, + "PWAT": {POS: DET, "PronType": "int"}, + "PWAV": {POS: ADV, "PronType": "int"}, + "PWS": {POS: PRON, "PronType": "int"}, + "TRUNC": {POS: X, "Hyph": "yes"}, + "VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"}, + "VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"}, + "VAINF": {POS: AUX, "VerbForm": "inf"}, + "VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"}, + "VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"}, + "VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"}, + "VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"}, + "VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"}, + "VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"}, + "VVINF": {POS: VERB, "VerbForm": "inf"}, + "VVIZU": {POS: VERB, "VerbForm": "inf"}, + "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, + "XY": {POS: X}, + "SP": {POS: SPACE} } From d5840c488bfc9d0d17f012f266860174b8a7b411 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 13:53:30 +0100 Subject: [PATCH 05/31] Clean unused code from fabfile --- fabfile.py | 163 ----------------------------------------------------- 1 file changed, 163 deletions(-) diff --git a/fabfile.py b/fabfile.py index d35ef8253..80a21146c 100644 --- a/fabfile.py +++ b/fabfile.py @@ -13,134 +13,6 @@ PWD = path.dirname(__file__) VENV_DIR = path.join(PWD, '.env') -def counts(): - pass - # Tokenize the corpus - # tokenize() - # get_freqs() - # Collate the counts - # cat freqs | sort -k2 | gather_freqs() - # gather_freqs() - # smooth() - - -# clean, make, sdist -# cd to new env, install from sdist, -# Push changes to server -# Pull changes on server -# clean make init model -# test --vectors --slow -# train -# test --vectors --slow --models -# sdist -# upload data to server -# change to clean venv -# py2: install from sdist, test --slow, download data, test --models --vectors -# py3: install from sdist, test --slow, download data, test --models --vectors - - -def prebuild(build_dir='/tmp/build_spacy'): - if file_exists(build_dir): - shutil.rmtree(build_dir) - os.mkdir(build_dir) - spacy_dir = path.dirname(__file__) - wn_url = 'http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz' - build_venv = path.join(build_dir, '.env') - with lcd(build_dir): - local('git clone %s .' % spacy_dir) - local('virtualenv ' + build_venv) - with prefix('cd %s && PYTHONPATH=`pwd` && . 
%s/bin/activate' % (build_dir, build_venv)): - local('pip install cython fabric fabtools pytest') - local('pip install --no-cache-dir -r requirements.txt') - local('fab clean make') - local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) - local('PYTHONPATH=`pwd` python bin/init_model.py en lang_data corpora spacy/en/data') - local('PYTHONPATH=`pwd` fab test') - local('PYTHONPATH=`pwd` python -m spacy.en.download --force all') - local('PYTHONPATH=`pwd` py.test --models spacy/tests/') - - -def web(): - def jade(source_name, out_dir): - pwd = path.join(path.dirname(__file__), 'website') - jade_loc = path.join(pwd, 'src', 'jade', source_name) - out_loc = path.join(pwd, 'site', out_dir) - local('jade -P %s --out %s' % (jade_loc, out_loc)) - - with virtualenv(VENV_DIR): - local('./website/create_code_samples spacy/tests/website/ website/src/code/') - - jade('404.jade', '') - jade('home/index.jade', '') - jade('docs/index.jade', 'docs/') - jade('blog/index.jade', 'blog/') - - for collection in ('blog', 'tutorials'): - for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir(): - if post_dir.is_dir() \ - and (post_dir / 'index.jade').exists() \ - and (post_dir / 'meta.jade').exists(): - jade(str(post_dir / 'index.jade'), path.join(collection, post_dir.parts[-1])) - - -def web_publish(assets_path): - from boto.s3.connection import S3Connection, OrdinaryCallingFormat - - site_path = 'website/site' - - os.environ['S3_USE_SIGV4'] = 'True' - conn = S3Connection(host='s3.eu-central-1.amazonaws.com', - calling_format=OrdinaryCallingFormat()) - bucket = conn.get_bucket('spacy.io', validate=False) - - keys_left = set([k.name for k in bucket.list() - if not k.name.startswith('resources')]) - - for root, dirnames, filenames in os.walk(site_path): - for dirname in dirnames: - target = os.path.relpath(os.path.join(root, dirname), site_path) - source = os.path.join(target, 'index.html') - - if os.path.exists(os.path.join(root, dirname, 'index.html')): - key = bucket.new_key(source) - key.set_redirect('//%s/%s' % (bucket.name, target)) - print('adding redirect for %s' % target) - - keys_left.remove(source) - - for filename in filenames: - source = os.path.join(root, filename) - - target = os.path.relpath(root, site_path) - if target == '.': - target = filename - elif filename != 'index.html': - target = os.path.join(target, filename) - - key = bucket.new_key(target) - key.set_metadata('Content-Type', 'text/html') - key.set_contents_from_filename(source) - print('uploading %s' % target) - - keys_left.remove(target) - - for key_name in keys_left: - print('deleting %s' % key_name) - bucket.delete_key(key_name) - - local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path) - - -def publish(version): - with virtualenv(VENV_DIR): - local('git push origin master') - local('git tag -a %s' % version) - local('git push origin %s' % version) - local('python setup.py sdist') - local('python setup.py register') - local('twine upload dist/spacy-%s.tar.gz' % version) - - def env(lang="python2.7"): if file_exists('.env'): local('rm -rf .env') @@ -172,38 +44,3 @@ def test(): with virtualenv(VENV_DIR): with lcd(path.dirname(__file__)): local('py.test -x spacy/tests') - - -def train(json_dir=None, dev_loc=None, model_dir=None): - if json_dir is None: - json_dir = 'corpora/en/json' - if model_dir is None: - model_dir = 'models/en/' - with virtualenv(VENV_DIR): - with lcd(path.dirname(__file__)): - local('python bin/init_model.py en lang_data/ corpora/ ' + model_dir) - 
local('python bin/parser/train.py -p en %s/train/ %s/development %s' % (json_dir, json_dir, model_dir)) - - -def travis(): - local('open https://travis-ci.org/honnibal/thinc') - - -def pos(): - with virtualenv(VENV_DIR): - local('python tools/train.py ~/work_data/docparse/wsj02-21.conll ~/work_data/docparse/wsj22.conll spacy/en/data') - local('python tools/tag.py ~/work_data/docparse/wsj22.raw /tmp/tmp') - local('python tools/eval_pos.py ~/work_data/docparse/wsj22.conll /tmp/tmp') - - -def ner(): - local('rm -rf data/en/ner') - local('python tools/train_ner.py ~/work_data/docparse/wsj02-21.conll data/en/ner') - local('python tools/tag_ner.py ~/work_data/docparse/wsj22.raw /tmp/tmp') - local('python tools/eval_ner.py ~/work_data/docparse/wsj22.conll /tmp/tmp | tail') - - -def conll(): - local('rm -rf data/en/ner') - local('python tools/conll03_train.py ~/work_data/ner/conll2003/eng.train data/en/ner/') - local('python tools/conll03_eval.py ~/work_data/ner/conll2003/eng.testa') From 46e98ec029d82ff2af970f507ab376cbd840858e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 14:03:40 +0100 Subject: [PATCH 06/31] Move init_model.py script from repo. These meta-tools should live elsewhere --- bin/init_model.py | 229 ---------------------------------------------- 1 file changed, 229 deletions(-) delete mode 100644 bin/init_model.py diff --git a/bin/init_model.py b/bin/init_model.py deleted file mode 100644 index be86cd17a..000000000 --- a/bin/init_model.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Set up a model directory. - -Requires: - - lang_data --- Rules for the tokenizer - * prefix.txt - * suffix.txt - * infix.txt - * morphs.json - * specials.json - - corpora --- Data files - * WordNet - * words.sgt.prob --- Smoothed unigram probabilities - * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters - * vectors.bz2 --- output of something like word2vec, compressed with bzip -""" -from __future__ import unicode_literals - -from ast import literal_eval -import math -import gzip -import json - -import plac -from pathlib import Path - -from shutil import copyfile -from shutil import copytree -from collections import defaultdict -import io - -from spacy.vocab import Vocab -from spacy.vocab import write_binary_vectors -from spacy.strings import hash_string -from preshed.counter import PreshCounter - -from spacy.parts_of_speech import NOUN, VERB, ADJ -from spacy.util import get_lang_class - - -try: - unicode -except NameError: - unicode = str - - -def setup_tokenizer(lang_data_dir, tok_dir): - if not tok_dir.exists(): - tok_dir.mkdir() - - for filename in ('infix.txt', 'morphs.json', 'prefix.txt', 'specials.json', - 'suffix.txt'): - src = lang_data_dir / filename - dst = tok_dir / filename - copyfile(str(src), str(dst)) - - -def _read_clusters(loc): - if not loc.exists(): - print("Warning: Clusters file not found") - return {} - clusters = {} - for line in io.open(str(loc), 'r', encoding='utf8'): - try: - cluster, word, freq = line.split() - except ValueError: - continue - # If the clusterer has only seen the word a few times, its cluster is - # unreliable. 
- if int(freq) >= 3: - clusters[word] = cluster - else: - clusters[word] = '0' - # Expand clusters with re-casing - for word, cluster in list(clusters.items()): - if word.lower() not in clusters: - clusters[word.lower()] = cluster - if word.title() not in clusters: - clusters[word.title()] = cluster - if word.upper() not in clusters: - clusters[word.upper()] = cluster - return clusters - - -def _read_probs(loc): - if not loc.exists(): - print("Probabilities file not found. Trying freqs.") - return {}, 0.0 - probs = {} - for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')): - prob, word = line.split() - prob = float(prob) - probs[word] = prob - return probs, probs['-OOV-'] - - -def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200): - if not loc.exists(): - print("Warning: Frequencies file not found") - return {}, 0.0 - counts = PreshCounter() - total = 0 - if str(loc).endswith('gz'): - file_ = gzip.open(str(loc)) - else: - file_ = loc.open() - for i, line in enumerate(file_): - freq, doc_freq, key = line.rstrip().split('\t', 2) - freq = int(freq) - counts.inc(i+1, freq) - total += freq - counts.smooth() - log_total = math.log(total) - if str(loc).endswith('gz'): - file_ = gzip.open(str(loc)) - else: - file_ = loc.open() - probs = {} - for line in file_: - freq, doc_freq, key = line.rstrip().split('\t', 2) - doc_freq = int(doc_freq) - freq = int(freq) - if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - word = literal_eval(key) - smooth_count = counts.smoother(int(freq)) - probs[word] = math.log(smooth_count) - log_total - oov_prob = math.log(counts.smoother(0)) - log_total - return probs, oov_prob - - -def _read_senses(loc): - lexicon = defaultdict(lambda: defaultdict(list)) - if not loc.exists(): - print("Warning: WordNet senses not found") - return lexicon - sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS)) - pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ} - for line in codecs.open(str(loc), 'r', 'utf8'): - sense_strings = line.split() - word = sense_strings.pop(0) - for sense in sense_strings: - pos, sense = sense[3:].split('.') - sense_name = '%s_%s' % (pos[0].upper(), sense.lower()) - if sense_name != 'N_tops': - sense_id = sense_names[sense_name] - lexicon[word][pos_ids[pos]].append(sense_id) - return lexicon - - -def setup_vocab(lex_attr_getters, tag_map, src_dir, dst_dir): - if not dst_dir.exists(): - dst_dir.mkdir() - print('Reading vocab from ', src_dir) - vectors_src = src_dir / 'vectors.bz2' - if vectors_src.exists(): - write_binary_vectors(vectors_src.as_posix(), (dst_dir / 'vec.bin').as_posix()) - else: - print("Warning: Word vectors file not found") - vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map) - clusters = _read_clusters(src_dir / 'clusters.txt') - probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') - if not probs: - probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz') - if not probs: - oov_prob = -20 - else: - oov_prob = min(probs.values()) - for word in clusters: - if word not in probs: - probs[word] = oov_prob - - lexicon = [] - for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - # First encode the strings into the StringStore. 
This way, we can map - # the orth IDs to frequency ranks - orth = vocab.strings[word] - # Now actually load the vocab - for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - lexeme = vocab[word] - lexeme.prob = prob - lexeme.is_oov = False - # Decode as a little-endian string, so that we can do & 15 to get - # the first 4 bits. See _parse_features.pyx - if word in clusters: - lexeme.cluster = int(clusters[word][::-1], 2) - else: - lexeme.cluster = 0 - vocab.dump((dst_dir / 'lexemes.bin').as_posix()) - with (dst_dir / 'strings.json').open('w') as file_: - vocab.strings.dump(file_) - with (dst_dir / 'oov_prob').open('w') as file_: - file_.write('%f' % oov_prob) - - -def main(lang_id, lang_data_dir, corpora_dir, model_dir): - model_dir = Path(model_dir) - lang_data_dir = Path(lang_data_dir) / lang_id - corpora_dir = Path(corpora_dir) / lang_id - - assert corpora_dir.exists() - assert lang_data_dir.exists() - - if not model_dir.exists(): - model_dir.mkdir() - - tag_map = json.load((lang_data_dir / 'tag_map.json').open()) - setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(get_lang_class(lang_id).Defaults.lex_attr_getters, tag_map, corpora_dir, - model_dir / 'vocab') - - if (lang_data_dir / 'gazetteer.json').exists(): - copyfile((lang_data_dir / 'gazetteer.json').as_posix(), - (model_dir / 'vocab' / 'gazetteer.json').as_posix()) - - copyfile((lang_data_dir / 'tag_map.json').as_posix(), - (model_dir / 'vocab' / 'tag_map.json').as_posix()) - - if (lang_data_dir / 'lemma_rules.json').exists(): - copyfile((lang_data_dir / 'lemma_rules.json').as_posix(), - (model_dir / 'vocab' / 'lemma_rules.json').as_posix()) - - if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): - copytree((corpora_dir / 'wordnet' / 'dict').as_posix(), - (model_dir / 'wordnet').as_posix()) - - -if __name__ == '__main__': - plac.call(main) From 121c310566da7e3853d76afbac102154975fa4e5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 13:33:24 +0100 Subject: [PATCH 07/31] Remove trailing whitespace --- spacy/tests/regression/test_issue595.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py index 97270c54f..ef4dc1a36 100644 --- a/spacy/tests/regression/test_issue595.py +++ b/spacy/tests/regression/test_issue595.py @@ -40,4 +40,3 @@ def test_not_lemmatize_base_forms(vocab, lemmatizer): feed.tag_ = u'VB' assert feed.text == u'feed' assert feed.lemma_ == u'feed' - From 77cf2fb0f63a5520de3b8b3456ce4c9181b91d16 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 13:33:40 +0100 Subject: [PATCH 08/31] Remove unnecessary argument in test --- spacy/tests/regression/test_issue595.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py index ef4dc1a36..1f0ed3a3c 100644 --- a/spacy/tests/regression/test_issue595.py +++ b/spacy/tests/regression/test_issue595.py @@ -34,7 +34,7 @@ def vocab(lemmatizer, tag_map): return Vocab(lemmatizer=lemmatizer, tag_map=tag_map) -def test_not_lemmatize_base_forms(vocab, lemmatizer): +def test_not_lemmatize_base_forms(vocab): doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"]) feed = doc[2] feed.tag_ = u'VB' From 57c4341453b4fe0b7056d5e1b7f4e54d396cd2ea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 14:59:44 +0100 Subject: [PATCH 09/31] Refactor loading of morphology exceptions, adding a method 
add_special_case. --- spacy/morphology.pyx | 62 ++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index a9c785d3a..fbcbc2e66 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,4 +1,7 @@ from os import path + +from libc.string cimport memset + from .lemmatizer import Lemmatizer try: @@ -85,35 +88,44 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: pass + def add_special_case(self, unicode tag_str, unicode orth_str, props, force=False): + '''Add a special-case rule to the morphological analyser. Tokens whose + tag and orth match the rule will receive the specified properties. + + Arguments: + tag (unicode): The part-of-speech tag to key the exception. + orth (unicode): The word-form to key the exception. + ''' + tag = self.strings[tag_str] + tag_id = self.reverse_index[tag] + orth = self.strings[orth_str] + rich_tag = self.rich_tags[tag_id] + props = _normalize_props(props) + + cached = self._cache.get(tag_id, orth) + if cached is NULL: + cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) + elif force: + memset(cached, 0, sizeof(cached)) + else: + msg = ("Conflicting morphology exception for (%s, %s). Use force=True " + "to overwrite.") + msg = msg % (tag_str, orth_str) + raise ValueError(msg) + + cached.tag = rich_tag + for name_str, value_str in props.items(): + self.assign_feature(&cached.tag.morph, name_str, value_str) + if cached.lemma == 0: + cached.lemma = self.lemmatize(rich_tag.pos, orth, + self.tag_map.get(tag_str, {})) + self._cache.set(tag_id, orth, cached) + def load_morph_exceptions(self, dict exc): # Map (form, pos) to (lemma, rich tag) - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef attr_t tag_id - cdef int pos - cdef RichTagC rich_tag for tag_str, entries in exc.items(): - tag = self.strings[tag_str] - tag_id = self.reverse_index[tag] - rich_tag = self.rich_tags[tag_id] for form_str, props in entries.items(): - cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) - cached.tag = rich_tag - orth = self.strings[form_str] - for name_str, value_str in props.items(): - if name_str == 'L': - cached.lemma = self.strings[value_str] - else: - self.assign_feature(&cached.tag.morph, name_str, value_str) - if cached.lemma == 0: - cached.lemma = self.lemmatize(rich_tag.pos, orth, - self.tag_map.get(tag_str, {})) - self._cache.set(tag_id, orth, cached) + self.add_special_case(tag_str, form_str, props) def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): cdef unicode py_string = self.strings[orth] From 62655fd36fd13a4bcad3cfd09b853126fa2b5a27 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:34:21 +0100 Subject: [PATCH 10/31] Add ENT_ID constant --- spacy/language_data/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language_data/util.py b/spacy/language_data/util.py index dceee1908..71365d879 100644 --- a/spacy/language_data/util.py +++ b/spacy/language_data/util.py @@ -5,6 +5,7 @@ from ..symbols import * PRON_LEMMA = "-PRON-" +ENT_ID = "ent_id" def update_exc(exc, additions): From 704c7442e010d28d9d6f1c3173a90e4e970d13de Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:35:36 +0100 Subject: [PATCH 11/31] Break language data components into their own files --- spacy/en/stop_words.py | 67 ++++++++ spacy/en/tag_map.py | 64 ++++++++ ...nguage_data.py 
=> tokenizer_exceptions.py} | 153 ------------------ 3 files changed, 131 insertions(+), 153 deletions(-) create mode 100644 spacy/en/stop_words.py create mode 100644 spacy/en/tag_map.py rename spacy/en/{language_data.py => tokenizer_exceptions.py} (88%) diff --git a/spacy/en/stop_words.py b/spacy/en/stop_words.py new file mode 100644 index 000000000..1b00eb974 --- /dev/null +++ b/spacy/en/stop_words.py @@ -0,0 +1,67 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a about above across after afterwards again against all almost alone along +already also although always am among amongst amount an and another any anyhow +anyone anything anyway anywhere are around as at + +back be became because become becomes becoming been before beforehand behind +being below beside besides between beyond both bottom but by + +call can cannot ca could + +did do does doing done down due during + +each eight either eleven else elsewhere empty enough etc even ever every +everyone everything everywhere except + +few fifteen fifty first five for former formerly forty four from front full +further + +get give go + +had has have he hence her here hereafter hereby herein hereupon hers herself +him himself his how however hundred + +i if in inc indeed into is it its itself + +keep + +last latter latterly least less + +just + +made make many may me meanwhile might mine more moreover most mostly move much +must my myself + +name namely neither never nevertheless next nine no nobody none noone nor not +nothing now nowhere + +of off often on once one only onto or other others otherwise our ours ourselves +out over own + +part per perhaps please put + +quite + +rather re really regarding + +same say see seem seemed seeming seems serious several she should show side +since six sixty so some somehow someone something sometime sometimes somewhere +still such + +take ten than that the their them themselves then thence there thereafter +thereby therefore therein thereupon these they third this those though three +through throughout thru thus to together too top toward towards twelve twenty +two + +under until up unless upon us used using + +various very very via was we well were what whatever when whence whenever where +whereafter whereas whereby wherein whereupon wherever whether which while +whither who whoever whole whom whose why will with within without would + +yet you your yours yourself yourselves +""".split()) diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py new file mode 100644 index 000000000..7a3589d0e --- /dev/null +++ b/spacy/en/tag_map.py @@ -0,0 +1,64 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TAG_MAP = { + ".": {POS: PUNCT, "PunctType": "peri"}, + ",": {POS: PUNCT, "PunctType": "comm"}, + "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, + "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, + "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"}, + "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + ":": {POS: PUNCT}, + "$": {POS: SYM, "Other": {"SymType": "currency"}}, + "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "AFX": {POS: ADJ, "Hyph": "yes"}, + "CC": {POS: CONJ, "ConjType": "coor"}, + "CD": {POS: NUM, "NumType": "card"}, + "DT": {POS: DET}, + "EX": {POS: ADV, "AdvType": "ex"}, + "FW": {POS: X, "Foreign": "yes"}, + "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "IN": {POS: ADP}, + "JJ": {POS: 
ADJ, "Degree": "pos"}, + "JJR": {POS: ADJ, "Degree": "comp"}, + "JJS": {POS: ADJ, "Degree": "sup"}, + "LS": {POS: PUNCT, "NumType": "ord"}, + "MD": {POS: VERB, "VerbType": "mod"}, + "NIL": {POS: ""}, + "NN": {POS: NOUN, "Number": "sing"}, + "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, + "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, + "NNS": {POS: NOUN, "Number": "plur"}, + "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "POS": {POS: PART, "Poss": "yes"}, + "PRP": {POS: PRON, "PronType": "prs"}, + "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "RB": {POS: ADV, "Degree": "pos"}, + "RBR": {POS: ADV, "Degree": "comp"}, + "RBS": {POS: ADV, "Degree": "sup"}, + "RP": {POS: PART}, + "SYM": {POS: SYM}, + "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "UH": {POS: INTJ}, + "VB": {POS: VERB, "VerbForm": "inf"}, + "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, + "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, + "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, + "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, + "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, + "WDT": {POS: ADJ, "PronType": "int|rel"}, + "WP": {POS: NOUN, "PronType": "int|rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WRB": {POS: ADV, "PronType": "int|rel"}, + "SP": {POS: SPACE}, + "ADD": {POS: X}, + "NFP": {POS: PUNCT}, + "GW": {POS: X}, + "XX": {POS: X}, + "BES": {POS: VERB}, + "HVS": {POS: VERB} +} diff --git a/spacy/en/language_data.py b/spacy/en/tokenizer_exceptions.py similarity index 88% rename from spacy/en/language_data.py rename to spacy/en/tokenizer_exceptions.py index 3daa00767..56cc1d7fa 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/tokenizer_exceptions.py @@ -3,159 +3,6 @@ from __future__ import unicode_literals from ..symbols import * from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES - - -def get_time_exc(hours): - exc = {} - for hour in hours: - exc["%da.m." % hour] = [ - {ORTH: hour}, - {ORTH: "a.m."} - ] - - exc["%dp.m." 
% hour] = [ - {ORTH: hour}, - {ORTH: "p.m."} - ] - - exc["%dam" % hour] = [ - {ORTH: hour}, - {ORTH: "am", LEMMA: "a.m."} - ] - - exc["%dpm" % hour] = [ - {ORTH: hour}, - {ORTH: "pm", LEMMA: "p.m."} - ] - return exc - - -TAG_MAP = { - ".": {POS: PUNCT, "PunctType": "peri"}, - ",": {POS: PUNCT, "PunctType": "comm"}, - "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, - "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, - "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"}, - "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, - "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, - ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, - "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CONJ, "ConjType": "coor"}, - "CD": {POS: NUM, "NumType": "card"}, - "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, - "FW": {POS: X, "Foreign": "yes"}, - "HYPH": {POS: PUNCT, "PunctType": "dash"}, - "IN": {POS: ADP}, - "JJ": {POS: ADJ, "Degree": "pos"}, - "JJR": {POS: ADJ, "Degree": "comp"}, - "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, - "NIL": {POS: ""}, - "NN": {POS: NOUN, "Number": "sing"}, - "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, - "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, - "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, - "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, - "RB": {POS: ADV, "Degree": "pos"}, - "RBR": {POS: ADV, "Degree": "comp"}, - "RBS": {POS: ADV, "Degree": "sup"}, - "RP": {POS: PART}, - "SYM": {POS: SYM}, - "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, - "UH": {POS: INTJ}, - "VB": {POS: VERB, "VerbForm": "inf"}, - "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, - "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, - "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, - "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, - "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, - "ADD": {POS: X}, - "NFP": {POS: PUNCT}, - "GW": {POS: X}, - "XX": {POS: X}, - "BES": {POS: VERB}, - "HVS": {POS: VERB} -} - - -STOP_WORDS = set(""" -a about above across after afterwards again against all almost alone along -already also although always am among amongst amount an and another any anyhow -anyone anything anyway anywhere are around as at - -back be became because become becomes becoming been before beforehand behind -being below beside besides between beyond both bottom but by - -call can cannot ca could - -did do does doing done down due during - -each eight either eleven else elsewhere empty enough etc even ever every -everyone everything everywhere except - -few fifteen fifty first five for former formerly forty four from front full -further - -get give go - -had has have he hence her here hereafter hereby herein hereupon hers herself -him himself his how however hundred - -i if in inc indeed into is it its itself - -keep - -last latter latterly least less - -just - -made make many may me meanwhile might mine more 
moreover most mostly move much -must my myself - -name namely neither never nevertheless next nine no nobody none noone nor not -nothing now nowhere - -of off often on once one only onto or other others otherwise our ours ourselves -out over own - -part per perhaps please put - -quite - -rather re really regarding - -same say see seem seemed seeming seems serious several she should show side -since six sixty so some somehow someone something sometime sometimes somewhere -still such - -take ten than that the their them themselves then thence there thereafter -thereby therefore therein thereupon these they third this those though three -through throughout thru thus to together too top toward towards twelve twenty -two - -under until up unless upon us used using - -various very very via was we well were what whatever when whence whenever where -whereafter whereas whereby wherein whereupon wherever whether which while -whither who whoever whole whom whose why will with within without would - -yet you your yours yourself yourselves -""".split()) TOKENIZER_EXCEPTIONS = { From eaa3b1319da2851dc104e566dd232aeeac7d2cd7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:35:56 +0100 Subject: [PATCH 12/31] Fix formatting --- spacy/language_data/punctuation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 65d7f95e8..fb784271e 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -130,4 +130,4 @@ TOKENIZER_INFIXES = r''' '''.strip().split('\n') -__all__ = [ "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES" ] +__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] From bc40dad7d9fcf4feeca9af6187788f20303c1e74 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:36:09 +0100 Subject: [PATCH 13/31] Add entity rules --- spacy/language_data/__init__.py | 1 + spacy/language_data/entity_rules.py | 206 ++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 spacy/language_data/entity_rules.py diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index c8109a51e..3aba785c2 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,3 +1,4 @@ from .emoticons import * from .punctuation import * +from .entity_rules import * from .util import * diff --git a/spacy/language_data/entity_rules.py b/spacy/language_data/entity_rules.py new file mode 100644 index 000000000..4217ecfcf --- /dev/null +++ b/spacy/language_data/entity_rules.py @@ -0,0 +1,206 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from .util import ENT_ID + + +ENTITY_RULES = [ + { + ENT_ID: "Reddit", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "reddit"}] + ] + }, + + { + ENT_ID: "Linux", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "linux"}] + ] + }, + + { + ENT_ID: "Haskell", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "haskell"}], + ] + }, + + { + ENT_ID: "HaskellCurry", + "attrs": {ENT_TYPE: "PERSON"}, + "patterns": [ + [{LOWER: "haskell"}, {LOWER: "curry"}] + ] + }, + + { + ENT_ID: "Javascript", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "javascript"}], + ] + }, + + { + ENT_ID: "CSS", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "css"}], + [{LOWER: "css3"}], + ] + }, + + { + ENT_ID: "HTML", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + 
[{LOWER: "html"}], + [{LOWER: "html5"}], + ] + }, + + { + ENT_ID: "Python", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Python"}] + ] + }, + + { + ENT_ID: "Ruby", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Ruby"}] + ] + }, + + { + ENT_ID: "spaCy", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "spacy"}] + ] + }, + + { + ENT_ID: "displaCy", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "displacy"}] + ] + }, + + { + ENT_ID: "Digg", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "digg"}] + ] + }, + + { + ENT_ID: "FoxNews", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "foxnews"}], + [{LOWER: "fox"}, {LOWER: "news"}] + ] + }, + + { + ENT_ID: "Google", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "google"}] + ] + }, + + { + ENT_ID: "Mac", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "mac"}] + ] + }, + + { + ENT_ID: "Wikipedia", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "wikipedia"}] + ] + }, + + { + ENT_ID: "Windows", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Windows"}] + ] + }, + + { + ENT_ID: "Dell", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "dell"}] + ] + }, + + { + ENT_ID: "Facebook", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "facebook"}] + ] + }, + + { + ENT_ID: "Blizzard", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{ORTH: "Blizzard"}] + ] + }, + + { + ENT_ID: "Ubuntu", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{ORTH: "Ubuntu"}] + ] + }, + + { + ENT_ID: "YouTube", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "youtube"}] + ] + } +] + + +FALSE_POSITIVES = [ + [{ORTH: "Shit"}], + [{ORTH: "Weed"}], + [{ORTH: "Cool"}], + [{ORTH: "Btw"}], + [{ORTH: "Bah"}], + [{ORTH: "Bullshit"}], + [{ORTH: "Lol"}], + [{ORTH: "Yo"}, {LOWER: "dawg"}], + [{ORTH: "Yay"}], + [{ORTH: "Ahh"}], + [{ORTH: "Yea"}], + [{ORTH: "Bah"}] +] + + +__all__ = ["ENTITY_RULES", "FALSE_POSITIVES"] From 29ad8143d877b9a1ce505bfd1ca391487c9dc24e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:36:15 +0100 Subject: [PATCH 14/31] Add morph rules --- spacy/en/morph_rules.py | 67 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 spacy/en/morph_rules.py diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py new file mode 100644 index 000000000..ab5c2b6a0 --- /dev/null +++ b/spacy/en/morph_rules.py @@ -0,0 +1,67 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +MORPH_RULES = { + "PRP": { + "I": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "me": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "you": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, + "he": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "him": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "she": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "her": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "it": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "we": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", 
"Number": "Plur", "Case": "Nom"}, + "us": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "they": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + + "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, + "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"}, + "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, + "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, + "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, + "ours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + "theirs": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, + + "myself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"}, + "yourself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"}, + "himself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"}, + "herself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"}, + "itself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"}, + "themself": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"}, + "ourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"}, + "yourselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"}, + "themselves": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc", "Reflex": "Yes"} + }, + + "PRP$": { + "my": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"}, + "your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"}, + "his": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"}, + "her": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"}, + "its": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"}, + "our": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}, + "their": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"} + }, + + "VBZ": { + "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, + "are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, + "is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, + }, + + "VBP": { + "are": {LEMMA: "be", "VerbForm": "Fin", "Tense": 
"Pres", "Mood": "Ind"} + }, + + "VBD": { + "was": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, + "were": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} + } +} From 2eb163c5dd675c2e7a9cedb5d6868545833cbf34 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:36:20 +0100 Subject: [PATCH 15/31] Add lemma rules --- spacy/en/lemma_rules.py | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 spacy/en/lemma_rules.py diff --git a/spacy/en/lemma_rules.py b/spacy/en/lemma_rules.py new file mode 100644 index 000000000..194712c24 --- /dev/null +++ b/spacy/en/lemma_rules.py @@ -0,0 +1,42 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +LEMMA_RULES = { + "noun": [ + ["s", ""], + ["ses", "s"], + ["ves", "f"], + ["xes", "x"], + ["zes", "z"], + ["ches", "ch"], + ["shes", "sh"], + ["men", "man"], + ["ies", "y"] + ], + + "verb": [ + ["s", ""], + ["ies", "y"], + ["es", "e"], + ["es", ""], + ["ed", "e"], + ["ed", ""], + ["ing", "e"], + ["ing", ""] + ], + + "adj": [ + ["er", ""], + ["est", ""], + ["er", "e"], + ["est", "e"] + ], + + "punct": [ + ["“", "\""], + ["”", "\""], + ["\u2018", "'"], + ["\u2019", "'"] + ] +} From 1bff59a8db2b9d39b213aed113c9b18e001943b0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:36:50 +0100 Subject: [PATCH 16/31] Update English language data --- spacy/en/language_data.py | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 spacy/en/language_data.py diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py new file mode 100644 index 000000000..1b0ba47df --- /dev/null +++ b/spacy/en/language_data.py @@ -0,0 +1,41 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA +from ..language_data import ENT_ID +from ..language_data import TOKENIZER_PREFIXES +from ..language_data import TOKENIZER_SUFFIXES +from ..language_data import TOKENIZER_INFIXES +from ..language_data import ENTITY_RULES, FALSE_POSITIVES + +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY +from .lemma_rules import LEMMA_RULES +from .morph_rules import MORPH_RULES + + +def get_time_exc(hours): + exc = {} + for hour in hours: + exc["%da.m." % hour] = [ + {ORTH: hour}, + {ORTH: "a.m."} + ] + + exc["%dp.m." 
% hour] = [ + {ORTH: hour}, + {ORTH: "p.m."} + ] + + exc["%dam" % hour] = [ + {ORTH: hour}, + {ORTH: "am", LEMMA: "a.m."} + ] + + exc["%dpm" % hour] = [ + {ORTH: hour}, + {ORTH: "pm", LEMMA: "p.m."} + ] + return exc From 32b36c38824c50f9ee2d6ac015bb37a7a8a52287 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:40:22 +0100 Subject: [PATCH 17/31] Break language data components into their own files --- spacy/de/language_data.py | 766 +------------------------------ spacy/de/stop_words.py | 81 ++++ spacy/de/tag_map.py | 65 +++ spacy/de/tokenizer_exceptions.py | 629 +++++++++++++++++++++++++ 4 files changed, 778 insertions(+), 763 deletions(-) create mode 100644 spacy/de/stop_words.py create mode 100644 spacy/de/tag_map.py create mode 100644 spacy/de/tokenizer_exceptions.py diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index b77ef6f37..4fefdb6a6 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -7,766 +7,6 @@ from ..language_data import TOKENIZER_PREFIXES from ..language_data import TOKENIZER_SUFFIXES from ..language_data import TOKENIZER_INFIXES - -TAG_MAP = { - "$(": {POS: PUNCT, "PunctType": "brck"}, - "$,": {POS: PUNCT, "PunctType": "comm"}, - "$.": {POS: PUNCT, "PunctType": "peri"}, - "ADJA": {POS: ADJ}, - "ADJD": {POS: ADJ, "Variant": "short"}, - "ADV": {POS: ADV}, - "APPO": {POS: ADP, "AdpType": "post"}, - "APPR": {POS: ADP, "AdpType": "prep"}, - "APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"}, - "APZR": {POS: ADP, "AdpType": "circ"}, - "ART": {POS: DET, "PronType": "art"}, - "CARD": {POS: NUM, "NumType": "card"}, - "FM": {POS: X, "Foreign": "yes"}, - "ITJ": {POS: INTJ}, - "KOKOM": {POS: CONJ, "ConjType": "comp"}, - "KON": {POS: CONJ}, - "KOUI": {POS: SCONJ}, - "KOUS": {POS: SCONJ}, - "NE": {POS: PROPN}, - "NNE": {POS: PROPN}, - "NN": {POS: NOUN}, - "PAV": {POS: ADV, "PronType": "dem"}, - "PROAV": {POS: ADV, "PronType": "dem"}, - "PDAT": {POS: DET, "PronType": "dem"}, - "PDS": {POS: PRON, "PronType": "dem"}, - "PIAT": {POS: DET, "PronType": "ind|neg|tot"}, - "PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, - "PIS": {POS: PRON, "PronType": "ind|neg|tot"}, - "PPER": {POS: PRON, "PronType": "prs"}, - "PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"}, - "PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"}, - "PRELAT": {POS: DET, "PronType": "rel"}, - "PRELS": {POS: PRON, "PronType": "rel"}, - "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, - "PTKA": {POS: PART}, - "PTKANT": {POS: PART, "PartType": "res"}, - "PTKNEG": {POS: PART, "Negative": "yes"}, - "PTKVZ": {POS: PART, "PartType": "vbp"}, - "PTKZU": {POS: PART, "PartType": "inf"}, - "PWAT": {POS: DET, "PronType": "int"}, - "PWAV": {POS: ADV, "PronType": "int"}, - "PWS": {POS: PRON, "PronType": "int"}, - "TRUNC": {POS: X, "Hyph": "yes"}, - "VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"}, - "VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"}, - "VAINF": {POS: AUX, "VerbForm": "inf"}, - "VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"}, - "VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"}, - "VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"}, - "VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"}, - "VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"}, - "VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"}, - "VVINF": {POS: VERB, "VerbForm": "inf"}, - "VVIZU": {POS: VERB, "VerbForm": "inf"}, - "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, - "XY": 
{POS: X}, - "SP": {POS: SPACE} -} - - -STOP_WORDS = set(""" -á a ab aber ach acht achte achten achter achtes ag alle allein allem allen -aller allerdings alles allgemeinen als also am an andere anderen andern anders -auch auf aus ausser außer ausserdem außerdem - -bald bei beide beiden beim beispiel bekannt bereits besonders besser besten bin -bis bisher bist - -da dabei dadurch dafür dagegen daher dahin dahinter damals damit danach daneben -dank dann daran darauf daraus darf darfst darin darüber darum darunter das -dasein daselbst dass daß dasselbe davon davor dazu dazwischen dein deine deinem -deiner dem dementsprechend demgegenüber demgemäss demgemäß demselben demzufolge -den denen denn denselben der deren derjenige derjenigen dermassen dermaßen -derselbe derselben des deshalb desselben dessen deswegen dich die diejenige -diejenigen dies diese dieselbe dieselben diesem diesen dieser dieses dir doch -dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft -durfte durften - -eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine -einem einen einer eines einigeeinigen einiger einiges einmal einmaleins elf en -ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch - -früher fünf fünfte fünften fünfter fünftes für - -gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen -geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige -gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen -großen grosser großer grosses großes gut gute guter gutes - -habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier -hin hinter hoch - -ich ihm ihn ihnen ihr ihre ihrem ihrer ihres im immer in indem infolgedessen -ins irgend ist - -ja jahr jahre jahren je jede jedem jeden jeder jedermann jedermanns jedoch -jemand jemandem jemanden jene jenem jenen jener jenes jetzt - -kam kann kannst kaum kein keine keinem keinen keiner kleine kleinen kleiner -kleines kommen kommt können könnt konnte könnte konnten kurz - -lang lange leicht leider lieber los - -machen macht machte mag magst man manche manchem manchen mancher manches mehr -mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel -mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst -musste mussten - -na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter -neuntes nicht nichts nie niemand niemandem niemanden noch nun nur - -ob oben oder offen oft ohne - -recht rechte rechten rechter rechtes richtig rund - -sagt sagte sah satt schlecht schon sechs sechste sechsten sechster sechstes -sehr sei seid seien sein seine seinem seinen seiner seines seit seitdem selbst -selbst sich sie sieben siebente siebenten siebenter siebentes siebte siebten -siebter siebtes sind so solang solche solchem solchen solcher solches soll -sollen sollte sollten sondern sonst sowie später statt - -tag tage tagen tat teil tel trotzdem tun - -über überhaupt übrigens uhr um und uns unser unsere unserer unter - -vergangene vergangenen viel viele vielem vielen vielleicht vier vierte vierten -vierter viertes vom von vor - -wahr während währenddem währenddessen wann war wäre waren wart warum was wegen -weil weit weiter weitere weiteren weiteres welche welchem welchen welcher -welches wem wen wenig wenige weniger weniges wenigstens wenn wer werde werden -werdet wessen wie wieder will willst wir wird wirklich wirst wo wohl wollen -wollt wollte wollten worden wurde würde wurden würden 
- -zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur -zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen -""".split()) - - -TOKENIZER_EXCEPTIONS = { - "\\n": [ - {ORTH: "\\n", LEMMA: "", TAG: "SP"} - ], - - "\\t": [ - {ORTH: "\\t", LEMMA: "", TAG: "SP"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: PRON_LEMMA} - ], - - "'n": [ - {ORTH: "'n", LEMMA: "ein"} - ], - - "'ne": [ - {ORTH: "'ne", LEMMA: "eine"} - ], - - "'nen": [ - {ORTH: "'nen", LEMMA: "einen"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "Abb.": [ - {ORTH: "Abb.", LEMMA: "Abbildung"} - ], - - "Abk.": [ - {ORTH: "Abk.", LEMMA: "Abkürzung"} - ], - - "Abt.": [ - {ORTH: "Abt.", LEMMA: "Abteilung"} - ], - - "Apr.": [ - {ORTH: "Apr.", LEMMA: "April"} - ], - - "Aug.": [ - {ORTH: "Aug.", LEMMA: "August"} - ], - - "Bd.": [ - {ORTH: "Bd.", LEMMA: "Band"} - ], - - "Betr.": [ - {ORTH: "Betr.", LEMMA: "Betreff"} - ], - - "Bf.": [ - {ORTH: "Bf.", LEMMA: "Bahnhof"} - ], - - "Bhf.": [ - {ORTH: "Bhf.", LEMMA: "Bahnhof"} - ], - - "Bsp.": [ - {ORTH: "Bsp.", LEMMA: "Beispiel"} - ], - - "Dez.": [ - {ORTH: "Dez.", LEMMA: "Dezember"} - ], - - "Di.": [ - {ORTH: "Di.", LEMMA: "Dienstag"} - ], - - "Do.": [ - {ORTH: "Do.", LEMMA: "Donnerstag"} - ], - - "Fa.": [ - {ORTH: "Fa.", LEMMA: "Firma"} - ], - - "Fam.": [ - {ORTH: "Fam.", LEMMA: "Familie"} - ], - - "Feb.": [ - {ORTH: "Feb.", LEMMA: "Februar"} - ], - - "Fr.": [ - {ORTH: "Fr.", LEMMA: "Frau"} - ], - - "Frl.": [ - {ORTH: "Frl.", LEMMA: "Fräulein"} - ], - - "Hbf.": [ - {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"} - ], - - "Hr.": [ - {ORTH: "Hr.", LEMMA: "Herr"} - ], - - "Hrn.": [ - {ORTH: "Hrn.", LEMMA: "Herr"} - ], - - "Jan.": [ - {ORTH: "Jan.", LEMMA: "Januar"} - ], - - "Jh.": [ - {ORTH: "Jh.", LEMMA: "Jahrhundert"} - ], - - "Jhd.": [ - {ORTH: "Jhd.", LEMMA: "Jahrhundert"} - ], - - "Jul.": [ - {ORTH: "Jul.", LEMMA: "Juli"} - ], - - "Jun.": [ - {ORTH: "Jun.", LEMMA: "Juni"} - ], - - "Mi.": [ - {ORTH: "Mi.", LEMMA: "Mittwoch"} - ], - - "Mio.": [ - {ORTH: "Mio.", LEMMA: "Million"} - ], - - "Mo.": [ - {ORTH: "Mo.", LEMMA: "Montag"} - ], - - "Mrd.": [ - {ORTH: "Mrd.", LEMMA: "Milliarde"} - ], - - "Mrz.": [ - {ORTH: "Mrz.", LEMMA: "März"} - ], - - "MwSt.": [ - {ORTH: "MwSt.", LEMMA: "Mehrwertsteuer"} - ], - - "Mär.": [ - {ORTH: "Mär.", LEMMA: "März"} - ], - - "Nov.": [ - {ORTH: "Nov.", LEMMA: "November"} - ], - - "Nr.": [ - {ORTH: "Nr.", LEMMA: "Nummer"} - ], - - "Okt.": [ - {ORTH: "Okt.", LEMMA: "Oktober"} - ], - - "Orig.": [ - {ORTH: "Orig.", LEMMA: "Original"} - ], - - "Pkt.": [ - {ORTH: "Pkt.", LEMMA: "Punkt"} - ], - - "Prof.": [ - {ORTH: "Prof.", LEMMA: "Professor"} - ], - - "Red.": [ - {ORTH: "Red.", LEMMA: "Redaktion"} - ], - - "S'": [ - {ORTH: "S'", LEMMA: PRON_LEMMA} - ], - - "Sa.": [ - {ORTH: "Sa.", LEMMA: "Samstag"} - ], - - "Sep.": [ - {ORTH: "Sep.", LEMMA: "September"} - ], - - "Sept.": [ - {ORTH: "Sept.", LEMMA: "September"} - ], - - "So.": [ - {ORTH: "So.", LEMMA: "Sonntag"} - ], - - "Std.": [ - {ORTH: "Std.", LEMMA: "Stunde"} - ], - - "Str.": [ - {ORTH: "Str.", LEMMA: "Straße"} - ], - - "Tel.": [ - {ORTH: "Tel.", LEMMA: "Telefon"} - ], - - "Tsd.": [ - {ORTH: "Tsd.", LEMMA: "Tausend"} - ], - - "Univ.": [ - {ORTH: "Univ.", LEMMA: "Universität"} - ], - - "abzgl.": [ - {ORTH: "abzgl.", LEMMA: "abzüglich"} - ], - - "allg.": [ - {ORTH: "allg.", LEMMA: "allgemein"} - ], - - "auf'm": [ - {ORTH: "auf", LEMMA: "auf"}, - {ORTH: "'m", LEMMA: PRON_LEMMA} - ], - - "bspw.": [ - {ORTH: "bspw.", LEMMA: "beispielsweise"} - ], - - "bzgl.": [ - {ORTH: "bzgl.", 
LEMMA: "bezüglich"} - ], - - "bzw.": [ - {ORTH: "bzw.", LEMMA: "beziehungsweise"} - ], - - "d.h.": [ - {ORTH: "d.h.", LEMMA: "das heißt"} - ], - - "dgl.": [ - {ORTH: "dgl.", LEMMA: "dergleichen"} - ], - - "du's": [ - {ORTH: "du", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "ebd.": [ - {ORTH: "ebd.", LEMMA: "ebenda"} - ], - - "eigtl.": [ - {ORTH: "eigtl.", LEMMA: "eigentlich"} - ], - - "engl.": [ - {ORTH: "engl.", LEMMA: "englisch"} - ], - - "er's": [ - {ORTH: "er", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "evtl.": [ - {ORTH: "evtl.", LEMMA: "eventuell"} - ], - - "frz.": [ - {ORTH: "frz.", LEMMA: "französisch"} - ], - - "gegr.": [ - {ORTH: "gegr.", LEMMA: "gegründet"} - ], - - "ggf.": [ - {ORTH: "ggf.", LEMMA: "gegebenenfalls"} - ], - - "ggfs.": [ - {ORTH: "ggfs.", LEMMA: "gegebenenfalls"} - ], - - "ggü.": [ - {ORTH: "ggü.", LEMMA: "gegenüber"} - ], - - "hinter'm": [ - {ORTH: "hinter", LEMMA: "hinter"}, - {ORTH: "'m", LEMMA: PRON_LEMMA} - ], - - "i.O.": [ - {ORTH: "i.O.", LEMMA: "in Ordnung"} - ], - - "i.d.R.": [ - {ORTH: "i.d.R.", LEMMA: "in der Regel"} - ], - - "ich's": [ - {ORTH: "ich", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "ihr's": [ - {ORTH: "ihr", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "incl.": [ - {ORTH: "incl.", LEMMA: "inklusive"} - ], - - "inkl.": [ - {ORTH: "inkl.", LEMMA: "inklusive"} - ], - - "insb.": [ - {ORTH: "insb.", LEMMA: "insbesondere"} - ], - - "kath.": [ - {ORTH: "kath.", LEMMA: "katholisch"} - ], - - "lt.": [ - {ORTH: "lt.", LEMMA: "laut"} - ], - - "max.": [ - {ORTH: "max.", LEMMA: "maximal"} - ], - - "min.": [ - {ORTH: "min.", LEMMA: "minimal"} - ], - - "mind.": [ - {ORTH: "mind.", LEMMA: "mindestens"} - ], - - "mtl.": [ - {ORTH: "mtl.", LEMMA: "monatlich"} - ], - - "n.Chr.": [ - {ORTH: "n.Chr.", LEMMA: "nach Christus"} - ], - - "orig.": [ - {ORTH: "orig.", LEMMA: "original"} - ], - - "röm.": [ - {ORTH: "röm.", LEMMA: "römisch"} - ], - - "s'": [ - {ORTH: "s'", LEMMA: PRON_LEMMA} - ], - - "s.o.": [ - {ORTH: "s.o.", LEMMA: "siehe oben"} - ], - - "sie's": [ - {ORTH: "sie", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "sog.": [ - {ORTH: "sog.", LEMMA: "so genannt"} - ], - - "stellv.": [ - {ORTH: "stellv.", LEMMA: "stellvertretend"} - ], - - "tägl.": [ - {ORTH: "tägl.", LEMMA: "täglich"} - ], - - "u.U.": [ - {ORTH: "u.U.", LEMMA: "unter Umständen"} - ], - - "u.s.w.": [ - {ORTH: "u.s.w.", LEMMA: "und so weiter"} - ], - - "u.v.m.": [ - {ORTH: "u.v.m.", LEMMA: "und vieles mehr"} - ], - - "unter'm": [ - {ORTH: "unter", LEMMA: "unter"}, - {ORTH: "'m", LEMMA: PRON_LEMMA} - ], - - "usf.": [ - {ORTH: "usf.", LEMMA: "und so fort"} - ], - - "usw.": [ - {ORTH: "usw.", LEMMA: "und so weiter"} - ], - - "uvm.": [ - {ORTH: "uvm.", LEMMA: "und vieles mehr"} - ], - - "v.Chr.": [ - {ORTH: "v.Chr.", LEMMA: "vor Christus"} - ], - - "v.a.": [ - {ORTH: "v.a.", LEMMA: "vor allem"} - ], - - "v.l.n.r.": [ - {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"} - ], - - "vgl.": [ - {ORTH: "vgl.", LEMMA: "vergleiche"} - ], - - "vllt.": [ - {ORTH: "vllt.", LEMMA: "vielleicht"} - ], - - "vlt.": [ - {ORTH: "vlt.", LEMMA: "vielleicht"} - ], - - "vor'm": [ - {ORTH: "vor", LEMMA: "vor"}, - {ORTH: "'m", LEMMA: PRON_LEMMA} - ], - - "wir's": [ - {ORTH: "wir", LEMMA: PRON_LEMMA}, - {ORTH: "'s", LEMMA: PRON_LEMMA} - ], - - "z.B.": [ - {ORTH: "z.B.", LEMMA: "zum Beispiel"} - ], - - "z.Bsp.": [ - {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"} - ], - - "z.T.": [ - {ORTH: "z.T.", LEMMA: "zum Teil"} - ], - - "z.Z.": [ 
- {ORTH: "z.Z.", LEMMA: "zur Zeit"} - ], - - "z.Zt.": [ - {ORTH: "z.Zt.", LEMMA: "zur Zeit"} - ], - - "z.b.": [ - {ORTH: "z.b.", LEMMA: "zum Beispiel"} - ], - - "zzgl.": [ - {ORTH: "zzgl.", LEMMA: "zuzüglich"} - ], - - "österr.": [ - {ORTH: "österr.", LEMMA: "österreichisch"} - ], - - "über'm": [ - {ORTH: "über", LEMMA: "über"}, - {ORTH: "'m", LEMMA: PRON_LEMMA} - ] -} - - -ORTH_ONLY = [ - "'", - "\\\")", - "", - "a.", - "ä.", - "A.C.", - "a.D.", - "A.D.", - "A.G.", - "a.M.", - "a.Z.", - "Abs.", - "adv.", - "al.", - "b.", - "B.A.", - "B.Sc.", - "betr.", - "biol.", - "Biol.", - "c.", - "ca.", - "Chr.", - "Cie.", - "co.", - "Co.", - "d.", - "D.C.", - "Dipl.-Ing.", - "Dipl.", - "Dr.", - "e.", - "e.g.", - "e.V.", - "ehem.", - "entspr.", - "erm.", - "etc.", - "ev.", - "f.", - "g.", - "G.m.b.H.", - "geb.", - "Gebr.", - "gem.", - "h.", - "h.c.", - "Hg.", - "hrsg.", - "Hrsg.", - "i.", - "i.A.", - "i.e.", - "i.G.", - "i.Tr.", - "i.V.", - "Ing.", - "j.", - "jr.", - "Jr.", - "jun.", - "jur.", - "k.", - "K.O.", - "l.", - "L.A.", - "lat.", - "m.", - "M.A.", - "m.E.", - "m.M.", - "M.Sc.", - "Mr.", - "n.", - "N.Y.", - "N.Y.C.", - "nat.", - "ö." - "o.", - "o.a.", - "o.ä.", - "o.g.", - "o.k.", - "O.K.", - "p.", - "p.a.", - "p.s.", - "P.S.", - "pers.", - "phil.", - "q.", - "q.e.d.", - "r.", - "R.I.P.", - "rer.", - "s.", - "sen.", - "St.", - "std.", - "t.", - "u.", - "ü.", - "u.a.", - "U.S.", - "U.S.A.", - "U.S.S.", - "v.", - "Vol.", - "vs.", - "w.", - "wiss.", - "x.", - "y.", - "z.", -] +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY diff --git a/spacy/de/stop_words.py b/spacy/de/stop_words.py new file mode 100644 index 000000000..66a89eee8 --- /dev/null +++ b/spacy/de/stop_words.py @@ -0,0 +1,81 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +á a ab aber ach acht achte achten achter achtes ag alle allein allem allen +aller allerdings alles allgemeinen als also am an andere anderen andern anders +auch auf aus ausser außer ausserdem außerdem + +bald bei beide beiden beim beispiel bekannt bereits besonders besser besten bin +bis bisher bist + +da dabei dadurch dafür dagegen daher dahin dahinter damals damit danach daneben +dank dann daran darauf daraus darf darfst darin darüber darum darunter das +dasein daselbst dass daß dasselbe davon davor dazu dazwischen dein deine deinem +deiner dem dementsprechend demgegenüber demgemäss demgemäß demselben demzufolge +den denen denn denselben der deren derjenige derjenigen dermassen dermaßen +derselbe derselben des deshalb desselben dessen deswegen dich die diejenige +diejenigen dies diese dieselbe dieselben diesem diesen dieser dieses dir doch +dort drei drin dritte dritten dritter drittes du durch durchaus dürfen dürft +durfte durften + +eben ebenso ehrlich eigen eigene eigenen eigener eigenes ein einander eine +einem einen einer eines einigeeinigen einiger einiges einmal einmaleins elf en +ende endlich entweder er erst erste ersten erster erstes es etwa etwas euch + +früher fünf fünfte fünften fünfter fünftes für + +gab ganz ganze ganzen ganzer ganzes gar gedurft gegen gegenüber gehabt gehen +geht gekannt gekonnt gemacht gemocht gemusst genug gerade gern gesagt geschweige +gewesen gewollt geworden gibt ging gleich gott gross groß grosse große grossen +großen grosser großer grosses großes gut gute guter gutes + +habe haben habt hast hat hatte hätte hatten hätten heisst heißt her heute hier +hin hinter hoch + +ich ihm ihn ihnen ihr ihre ihrem 
ihrer ihres im immer in indem infolgedessen +ins irgend ist + +ja jahr jahre jahren je jede jedem jeden jeder jedermann jedermanns jedoch +jemand jemandem jemanden jene jenem jenen jener jenes jetzt + +kam kann kannst kaum kein keine keinem keinen keiner kleine kleinen kleiner +kleines kommen kommt können könnt konnte könnte konnten kurz + +lang lange leicht leider lieber los + +machen macht machte mag magst man manche manchem manchen mancher manches mehr +mein meine meinem meinen meiner meines mensch menschen mich mir mit mittel +mochte möchte mochten mögen möglich mögt morgen muss muß müssen musst müsst +musste mussten + +na nach nachdem nahm natürlich neben nein neue neuen neun neunte neunten neunter +neuntes nicht nichts nie niemand niemandem niemanden noch nun nur + +ob oben oder offen oft ohne + +recht rechte rechten rechter rechtes richtig rund + +sagt sagte sah satt schlecht schon sechs sechste sechsten sechster sechstes +sehr sei seid seien sein seine seinem seinen seiner seines seit seitdem selbst +selbst sich sie sieben siebente siebenten siebenter siebentes siebte siebten +siebter siebtes sind so solang solche solchem solchen solcher solches soll +sollen sollte sollten sondern sonst sowie später statt + +tag tage tagen tat teil tel trotzdem tun + +über überhaupt übrigens uhr um und uns unser unsere unserer unter + +vergangene vergangenen viel viele vielem vielen vielleicht vier vierte vierten +vierter viertes vom von vor + +wahr während währenddem währenddessen wann war wäre waren wart warum was wegen +weil weit weiter weitere weiteren weiteres welche welchem welchen welcher +welches wem wen wenig wenige weniger weniges wenigstens wenn wer werde werden +werdet wessen wie wieder will willst wir wird wirklich wirst wo wohl wollen +wollt wollte wollten worden wurde würde wurden würden + +zehn zehnte zehnten zehnter zehntes zeit zu zuerst zugleich zum zunächst zur +zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen +""".split()) diff --git a/spacy/de/tag_map.py b/spacy/de/tag_map.py new file mode 100644 index 000000000..e5996b38c --- /dev/null +++ b/spacy/de/tag_map.py @@ -0,0 +1,65 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TAG_MAP = { + "$(": {POS: PUNCT, "PunctType": "brck"}, + "$,": {POS: PUNCT, "PunctType": "comm"}, + "$.": {POS: PUNCT, "PunctType": "peri"}, + "ADJA": {POS: ADJ}, + "ADJD": {POS: ADJ, "Variant": "short"}, + "ADV": {POS: ADV}, + "APPO": {POS: ADP, "AdpType": "post"}, + "APPR": {POS: ADP, "AdpType": "prep"}, + "APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"}, + "APZR": {POS: ADP, "AdpType": "circ"}, + "ART": {POS: DET, "PronType": "art"}, + "CARD": {POS: NUM, "NumType": "card"}, + "FM": {POS: X, "Foreign": "yes"}, + "ITJ": {POS: INTJ}, + "KOKOM": {POS: CONJ, "ConjType": "comp"}, + "KON": {POS: CONJ}, + "KOUI": {POS: SCONJ}, + "KOUS": {POS: SCONJ}, + "NE": {POS: PROPN}, + "NNE": {POS: PROPN}, + "NN": {POS: NOUN}, + "PAV": {POS: ADV, "PronType": "dem"}, + "PROAV": {POS: ADV, "PronType": "dem"}, + "PDAT": {POS: DET, "PronType": "dem"}, + "PDS": {POS: PRON, "PronType": "dem"}, + "PIAT": {POS: DET, "PronType": "ind|neg|tot"}, + "PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, + "PIS": {POS: PRON, "PronType": "ind|neg|tot"}, + "PPER": {POS: PRON, "PronType": "prs"}, + "PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"}, + "PPOSS": {POS: PRON, "Poss": "yes", "PronType": "prs"}, + "PRELAT": {POS: DET, "PronType": "rel"}, + "PRELS": {POS: PRON, "PronType": "rel"}, + 
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, + "PTKA": {POS: PART}, + "PTKANT": {POS: PART, "PartType": "res"}, + "PTKNEG": {POS: PART, "Negative": "yes"}, + "PTKVZ": {POS: PART, "PartType": "vbp"}, + "PTKZU": {POS: PART, "PartType": "inf"}, + "PWAT": {POS: DET, "PronType": "int"}, + "PWAV": {POS: ADV, "PronType": "int"}, + "PWS": {POS: PRON, "PronType": "int"}, + "TRUNC": {POS: X, "Hyph": "yes"}, + "VAFIN": {POS: AUX, "Mood": "ind", "VerbForm": "fin"}, + "VAIMP": {POS: AUX, "Mood": "imp", "VerbForm": "fin"}, + "VAINF": {POS: AUX, "VerbForm": "inf"}, + "VAPP": {POS: AUX, "Aspect": "perf", "VerbForm": "part"}, + "VMFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin", "VerbType": "mod"}, + "VMINF": {POS: VERB, "VerbForm": "inf", "VerbType": "mod"}, + "VMPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part", "VerbType": "mod"}, + "VVFIN": {POS: VERB, "Mood": "ind", "VerbForm": "fin"}, + "VVIMP": {POS: VERB, "Mood": "imp", "VerbForm": "fin"}, + "VVINF": {POS: VERB, "VerbForm": "inf"}, + "VVIZU": {POS: VERB, "VerbForm": "inf"}, + "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, + "XY": {POS: X}, + "SP": {POS: SPACE} +} diff --git a/spacy/de/tokenizer_exceptions.py b/spacy/de/tokenizer_exceptions.py new file mode 100644 index 000000000..d7d9a2f3a --- /dev/null +++ b/spacy/de/tokenizer_exceptions.py @@ -0,0 +1,629 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "\\n": [ + {ORTH: "\\n", LEMMA: "", TAG: "SP"} + ], + + "\\t": [ + {ORTH: "\\t", LEMMA: "", TAG: "SP"} + ], + + "'S": [ + {ORTH: "'S", LEMMA: PRON_LEMMA} + ], + + "'n": [ + {ORTH: "'n", LEMMA: "ein"} + ], + + "'ne": [ + {ORTH: "'ne", LEMMA: "eine"} + ], + + "'nen": [ + {ORTH: "'nen", LEMMA: "einen"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "Abb.": [ + {ORTH: "Abb.", LEMMA: "Abbildung"} + ], + + "Abk.": [ + {ORTH: "Abk.", LEMMA: "Abkürzung"} + ], + + "Abt.": [ + {ORTH: "Abt.", LEMMA: "Abteilung"} + ], + + "Apr.": [ + {ORTH: "Apr.", LEMMA: "April"} + ], + + "Aug.": [ + {ORTH: "Aug.", LEMMA: "August"} + ], + + "Bd.": [ + {ORTH: "Bd.", LEMMA: "Band"} + ], + + "Betr.": [ + {ORTH: "Betr.", LEMMA: "Betreff"} + ], + + "Bf.": [ + {ORTH: "Bf.", LEMMA: "Bahnhof"} + ], + + "Bhf.": [ + {ORTH: "Bhf.", LEMMA: "Bahnhof"} + ], + + "Bsp.": [ + {ORTH: "Bsp.", LEMMA: "Beispiel"} + ], + + "Dez.": [ + {ORTH: "Dez.", LEMMA: "Dezember"} + ], + + "Di.": [ + {ORTH: "Di.", LEMMA: "Dienstag"} + ], + + "Do.": [ + {ORTH: "Do.", LEMMA: "Donnerstag"} + ], + + "Fa.": [ + {ORTH: "Fa.", LEMMA: "Firma"} + ], + + "Fam.": [ + {ORTH: "Fam.", LEMMA: "Familie"} + ], + + "Feb.": [ + {ORTH: "Feb.", LEMMA: "Februar"} + ], + + "Fr.": [ + {ORTH: "Fr.", LEMMA: "Frau"} + ], + + "Frl.": [ + {ORTH: "Frl.", LEMMA: "Fräulein"} + ], + + "Hbf.": [ + {ORTH: "Hbf.", LEMMA: "Hauptbahnhof"} + ], + + "Hr.": [ + {ORTH: "Hr.", LEMMA: "Herr"} + ], + + "Hrn.": [ + {ORTH: "Hrn.", LEMMA: "Herr"} + ], + + "Jan.": [ + {ORTH: "Jan.", LEMMA: "Januar"} + ], + + "Jh.": [ + {ORTH: "Jh.", LEMMA: "Jahrhundert"} + ], + + "Jhd.": [ + {ORTH: "Jhd.", LEMMA: "Jahrhundert"} + ], + + "Jul.": [ + {ORTH: "Jul.", LEMMA: "Juli"} + ], + + "Jun.": [ + {ORTH: "Jun.", LEMMA: "Juni"} + ], + + "Mi.": [ + {ORTH: "Mi.", LEMMA: "Mittwoch"} + ], + + "Mio.": [ + {ORTH: "Mio.", LEMMA: "Million"} + ], + + "Mo.": [ + {ORTH: "Mo.", LEMMA: "Montag"} + ], + + "Mrd.": [ + {ORTH: "Mrd.", LEMMA: "Milliarde"} + ], + + "Mrz.": [ + {ORTH: "Mrz.", LEMMA: "März"} + ], + + "MwSt.": [ + {ORTH: 
"MwSt.", LEMMA: "Mehrwertsteuer"} + ], + + "Mär.": [ + {ORTH: "Mär.", LEMMA: "März"} + ], + + "Nov.": [ + {ORTH: "Nov.", LEMMA: "November"} + ], + + "Nr.": [ + {ORTH: "Nr.", LEMMA: "Nummer"} + ], + + "Okt.": [ + {ORTH: "Okt.", LEMMA: "Oktober"} + ], + + "Orig.": [ + {ORTH: "Orig.", LEMMA: "Original"} + ], + + "Pkt.": [ + {ORTH: "Pkt.", LEMMA: "Punkt"} + ], + + "Prof.": [ + {ORTH: "Prof.", LEMMA: "Professor"} + ], + + "Red.": [ + {ORTH: "Red.", LEMMA: "Redaktion"} + ], + + "S'": [ + {ORTH: "S'", LEMMA: PRON_LEMMA} + ], + + "Sa.": [ + {ORTH: "Sa.", LEMMA: "Samstag"} + ], + + "Sep.": [ + {ORTH: "Sep.", LEMMA: "September"} + ], + + "Sept.": [ + {ORTH: "Sept.", LEMMA: "September"} + ], + + "So.": [ + {ORTH: "So.", LEMMA: "Sonntag"} + ], + + "Std.": [ + {ORTH: "Std.", LEMMA: "Stunde"} + ], + + "Str.": [ + {ORTH: "Str.", LEMMA: "Straße"} + ], + + "Tel.": [ + {ORTH: "Tel.", LEMMA: "Telefon"} + ], + + "Tsd.": [ + {ORTH: "Tsd.", LEMMA: "Tausend"} + ], + + "Univ.": [ + {ORTH: "Univ.", LEMMA: "Universität"} + ], + + "abzgl.": [ + {ORTH: "abzgl.", LEMMA: "abzüglich"} + ], + + "allg.": [ + {ORTH: "allg.", LEMMA: "allgemein"} + ], + + "auf'm": [ + {ORTH: "auf", LEMMA: "auf"}, + {ORTH: "'m", LEMMA: PRON_LEMMA} + ], + + "bspw.": [ + {ORTH: "bspw.", LEMMA: "beispielsweise"} + ], + + "bzgl.": [ + {ORTH: "bzgl.", LEMMA: "bezüglich"} + ], + + "bzw.": [ + {ORTH: "bzw.", LEMMA: "beziehungsweise"} + ], + + "d.h.": [ + {ORTH: "d.h.", LEMMA: "das heißt"} + ], + + "dgl.": [ + {ORTH: "dgl.", LEMMA: "dergleichen"} + ], + + "du's": [ + {ORTH: "du", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "ebd.": [ + {ORTH: "ebd.", LEMMA: "ebenda"} + ], + + "eigtl.": [ + {ORTH: "eigtl.", LEMMA: "eigentlich"} + ], + + "engl.": [ + {ORTH: "engl.", LEMMA: "englisch"} + ], + + "er's": [ + {ORTH: "er", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "evtl.": [ + {ORTH: "evtl.", LEMMA: "eventuell"} + ], + + "frz.": [ + {ORTH: "frz.", LEMMA: "französisch"} + ], + + "gegr.": [ + {ORTH: "gegr.", LEMMA: "gegründet"} + ], + + "ggf.": [ + {ORTH: "ggf.", LEMMA: "gegebenenfalls"} + ], + + "ggfs.": [ + {ORTH: "ggfs.", LEMMA: "gegebenenfalls"} + ], + + "ggü.": [ + {ORTH: "ggü.", LEMMA: "gegenüber"} + ], + + "hinter'm": [ + {ORTH: "hinter", LEMMA: "hinter"}, + {ORTH: "'m", LEMMA: PRON_LEMMA} + ], + + "i.O.": [ + {ORTH: "i.O.", LEMMA: "in Ordnung"} + ], + + "i.d.R.": [ + {ORTH: "i.d.R.", LEMMA: "in der Regel"} + ], + + "ich's": [ + {ORTH: "ich", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "ihr's": [ + {ORTH: "ihr", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "incl.": [ + {ORTH: "incl.", LEMMA: "inklusive"} + ], + + "inkl.": [ + {ORTH: "inkl.", LEMMA: "inklusive"} + ], + + "insb.": [ + {ORTH: "insb.", LEMMA: "insbesondere"} + ], + + "kath.": [ + {ORTH: "kath.", LEMMA: "katholisch"} + ], + + "lt.": [ + {ORTH: "lt.", LEMMA: "laut"} + ], + + "max.": [ + {ORTH: "max.", LEMMA: "maximal"} + ], + + "min.": [ + {ORTH: "min.", LEMMA: "minimal"} + ], + + "mind.": [ + {ORTH: "mind.", LEMMA: "mindestens"} + ], + + "mtl.": [ + {ORTH: "mtl.", LEMMA: "monatlich"} + ], + + "n.Chr.": [ + {ORTH: "n.Chr.", LEMMA: "nach Christus"} + ], + + "orig.": [ + {ORTH: "orig.", LEMMA: "original"} + ], + + "röm.": [ + {ORTH: "röm.", LEMMA: "römisch"} + ], + + "s'": [ + {ORTH: "s'", LEMMA: PRON_LEMMA} + ], + + "s.o.": [ + {ORTH: "s.o.", LEMMA: "siehe oben"} + ], + + "sie's": [ + {ORTH: "sie", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "sog.": [ + {ORTH: "sog.", LEMMA: "so genannt"} + ], + + 
"stellv.": [ + {ORTH: "stellv.", LEMMA: "stellvertretend"} + ], + + "tägl.": [ + {ORTH: "tägl.", LEMMA: "täglich"} + ], + + "u.U.": [ + {ORTH: "u.U.", LEMMA: "unter Umständen"} + ], + + "u.s.w.": [ + {ORTH: "u.s.w.", LEMMA: "und so weiter"} + ], + + "u.v.m.": [ + {ORTH: "u.v.m.", LEMMA: "und vieles mehr"} + ], + + "unter'm": [ + {ORTH: "unter", LEMMA: "unter"}, + {ORTH: "'m", LEMMA: PRON_LEMMA} + ], + + "usf.": [ + {ORTH: "usf.", LEMMA: "und so fort"} + ], + + "usw.": [ + {ORTH: "usw.", LEMMA: "und so weiter"} + ], + + "uvm.": [ + {ORTH: "uvm.", LEMMA: "und vieles mehr"} + ], + + "v.Chr.": [ + {ORTH: "v.Chr.", LEMMA: "vor Christus"} + ], + + "v.a.": [ + {ORTH: "v.a.", LEMMA: "vor allem"} + ], + + "v.l.n.r.": [ + {ORTH: "v.l.n.r.", LEMMA: "von links nach rechts"} + ], + + "vgl.": [ + {ORTH: "vgl.", LEMMA: "vergleiche"} + ], + + "vllt.": [ + {ORTH: "vllt.", LEMMA: "vielleicht"} + ], + + "vlt.": [ + {ORTH: "vlt.", LEMMA: "vielleicht"} + ], + + "vor'm": [ + {ORTH: "vor", LEMMA: "vor"}, + {ORTH: "'m", LEMMA: PRON_LEMMA} + ], + + "wir's": [ + {ORTH: "wir", LEMMA: PRON_LEMMA}, + {ORTH: "'s", LEMMA: PRON_LEMMA} + ], + + "z.B.": [ + {ORTH: "z.B.", LEMMA: "zum Beispiel"} + ], + + "z.Bsp.": [ + {ORTH: "z.Bsp.", LEMMA: "zum Beispiel"} + ], + + "z.T.": [ + {ORTH: "z.T.", LEMMA: "zum Teil"} + ], + + "z.Z.": [ + {ORTH: "z.Z.", LEMMA: "zur Zeit"} + ], + + "z.Zt.": [ + {ORTH: "z.Zt.", LEMMA: "zur Zeit"} + ], + + "z.b.": [ + {ORTH: "z.b.", LEMMA: "zum Beispiel"} + ], + + "zzgl.": [ + {ORTH: "zzgl.", LEMMA: "zuzüglich"} + ], + + "österr.": [ + {ORTH: "österr.", LEMMA: "österreichisch"} + ], + + "über'm": [ + {ORTH: "über", LEMMA: "über"}, + {ORTH: "'m", LEMMA: PRON_LEMMA} + ] +} + + +ORTH_ONLY = [ + "'", + "\\\")", + "", + "a.", + "ä.", + "A.C.", + "a.D.", + "A.D.", + "A.G.", + "a.M.", + "a.Z.", + "Abs.", + "adv.", + "al.", + "b.", + "B.A.", + "B.Sc.", + "betr.", + "biol.", + "Biol.", + "c.", + "ca.", + "Chr.", + "Cie.", + "co.", + "Co.", + "d.", + "D.C.", + "Dipl.-Ing.", + "Dipl.", + "Dr.", + "e.", + "e.g.", + "e.V.", + "ehem.", + "entspr.", + "erm.", + "etc.", + "ev.", + "f.", + "g.", + "G.m.b.H.", + "geb.", + "Gebr.", + "gem.", + "h.", + "h.c.", + "Hg.", + "hrsg.", + "Hrsg.", + "i.", + "i.A.", + "i.e.", + "i.G.", + "i.Tr.", + "i.V.", + "Ing.", + "j.", + "jr.", + "Jr.", + "jun.", + "jur.", + "k.", + "K.O.", + "l.", + "L.A.", + "lat.", + "m.", + "M.A.", + "m.E.", + "m.M.", + "M.Sc.", + "Mr.", + "n.", + "N.Y.", + "N.Y.C.", + "nat.", + "ö." 
+ "o.", + "o.a.", + "o.ä.", + "o.g.", + "o.k.", + "O.K.", + "p.", + "p.a.", + "p.s.", + "P.S.", + "pers.", + "phil.", + "q.", + "q.e.d.", + "r.", + "R.I.P.", + "rer.", + "s.", + "sen.", + "St.", + "std.", + "t.", + "u.", + "ü.", + "u.a.", + "U.S.", + "U.S.A.", + "U.S.S.", + "v.", + "Vol.", + "vs.", + "w.", + "wiss.", + "x.", + "y.", + "z.", +] From e6fc4afb04a0e6f442793b6b420d4a0e381cd758 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 15:48:00 +0100 Subject: [PATCH 18/31] Whitespace --- spacy/morphology.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fbcbc2e66..fb6273753 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -140,6 +140,7 @@ cdef class Morphology: lemma = self.strings[lemma_string] return lemma + IDS = { "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam, From 44f4f008bd7df82647b215466c6b268847f53f44 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 15:50:09 +0100 Subject: [PATCH 19/31] Wire up lemmatizer rules for English --- spacy/en/__init__.py | 3 +++ spacy/language.py | 4 +++- spacy/lemmatizer.py | 9 ++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 623585f7d..6e706db52 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -19,6 +19,8 @@ from ..language_data import EMOTICONS from .language_data import ORTH_ONLY from .language_data import get_time_exc +from .lemma_rules import LEMMA_RULES + TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) @@ -47,3 +49,4 @@ class English(Language): infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS + lemma_rules = LEMMA_RULES diff --git a/spacy/language.py b/spacy/language.py index 78dbac953..7d3d91846 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -38,7 +38,7 @@ class BaseDefaults(object): if nlp is None or nlp.path is None: return Lemmatizer({}, {}, {}) else: - return Lemmatizer.load(nlp.path) + return Lemmatizer.load(nlp.path, rules=self.lemma_rules) @classmethod def create_vocab(cls, nlp=None): @@ -159,6 +159,8 @@ class BaseDefaults(object): stop_words = set() + lemma_rules = {} + lex_attr_getters = { attrs.LOWER: lambda string: string.lower(), attrs.NORM: lambda string: string, diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a79ecb009..960467a0b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod - def load(cls, path): + def load(cls, path, rules=None): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: @@ -25,8 +25,11 @@ class Lemmatizer(object): exc[pos] = read_exc(file_) else: exc[pos] = {} - with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: - rules = json.load(file_) + if rules is None and (path / 'vocab' / 'lemma_rules.json').exists(): + with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: + rules = json.load(file_) + elif rules is None: + rules = {} return cls(index, exc, rules) def __init__(self, index, exceptions, rules): From 837a5d41003ef3d2ec855e8d96caf30e958fcd65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:49:46 +0100 Subject: [PATCH 20/31] Update morphology class so that exceptions can be added one-by-one, and so that arbitrary attributes can be referenced. 
--- spacy/morphology.pyx | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fb6273753..bd02d0489 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -9,10 +9,11 @@ try: except ImportError: import json -from .parts_of_speech import IDS as POS_IDS from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT from .attrs cimport POS, IS_SPACE +from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme +from .attrs import intify_attrs def _normalize_props(props): @@ -32,6 +33,7 @@ def _normalize_props(props): return out + cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() @@ -43,12 +45,13 @@ cdef class Morphology: self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) - for i, (tag_str, props) in enumerate(sorted(tag_map.items())): - props = _normalize_props(props) + for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + attrs = _normalize_props(attrs) + attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = props[POS] + self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i self._cache = PreshMapArray(self.n_tags) @@ -85,10 +88,14 @@ cdef class Morphology: token.tag = analysis.tag.name token.morph = analysis.tag.morph - cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: - pass + cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1: + cdef flags_t one = 1 + if value: + flags[0] |= one << flag_id + else: + flags[0] &= ~(one << flag_id) - def add_special_case(self, unicode tag_str, unicode orth_str, props, force=False): + def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): '''Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. @@ -100,13 +107,13 @@ cdef class Morphology: tag_id = self.reverse_index[tag] orth = self.strings[orth_str] rich_tag = self.rich_tags[tag_id] - props = _normalize_props(props) + attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) cached = self._cache.get(tag_id, orth) if cached is NULL: cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) elif force: - memset(cached, 0, sizeof(cached)) + memset(cached, 0, sizeof(cached[0])) else: msg = ("Conflicting morphology exception for (%s, %s). 
Use force=True " "to overwrite.") @@ -114,8 +121,8 @@ cdef class Morphology: raise ValueError(msg) cached.tag = rich_tag - for name_str, value_str in props.items(): - self.assign_feature(&cached.tag.morph, name_str, value_str) + for name_id, value_id in attrs.items(): + self.assign_feature(&cached.tag.morph, name_id, value_id) if cached.lemma == 0: cached.lemma = self.lemmatize(rich_tag.pos, orth, self.tag_map.get(tag_str, {})) @@ -124,8 +131,8 @@ cdef class Morphology: def load_morph_exceptions(self, dict exc): # Map (form, pos) to (lemma, rich tag) for tag_str, entries in exc.items(): - for form_str, props in entries.items(): - self.add_special_case(tag_str, form_str, props) + for form_str, attrs in entries.items(): + self.add_special_case(tag_str, form_str, attrs) def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): cdef unicode py_string = self.strings[orth] From d58187ffa797614b657c21ab9e7c55aab2a4f7ab Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:50:26 +0100 Subject: [PATCH 21/31] Filter out morphology keys in deprecated attrs --- spacy/attrs.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ce4005324..ddcbdff64 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -120,8 +120,14 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): stringy_attrs.pop('number') if 'tenspect' in stringy_attrs: stringy_attrs.pop('tenspect') - # for name, value in morphs.items(): - # stringy_attrs[name] = value + morph_keys = [ + 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', + 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', + 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', + 'Number', 'PronType', 'AdjType', 'Person'] + for key in morph_keys: + if key in stringy_attrs: + stringy_attrs.pop(key) for name, value in stringy_attrs.items(): if isinstance(name, int): int_key = name From 33996e770bde8fe169f08db7dcb5740acea6b2bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:50:42 +0100 Subject: [PATCH 22/31] Update header for morphology class --- spacy/morphology.pxd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index aa45c47f0..5dc1ce529 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -4,11 +4,12 @@ from libc.stdint cimport uint64_t from .structs cimport TokenC from .strings cimport StringStore -from .typedefs cimport attr_t +from .typedefs cimport attr_t, flags_t from .parts_of_speech cimport univ_pos_t from . 
cimport symbols + cdef struct RichTagC: uint64_t morph int id @@ -37,7 +38,7 @@ cdef class Morphology: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 + cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 cpdef enum univ_morph_t: From 6ee1df93c5a571e3e3decee19a6b2e2a46d1618d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:51:10 +0100 Subject: [PATCH 23/31] Set tag_map to None if it's not seen in the data by vocab --- spacy/vocab.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 0a37b5c3b..e61c7f2ba 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -83,6 +83,8 @@ cdef class Vocab: if tag_map is True and (path / 'vocab' / 'tag_map.json').exists(): with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_: tag_map = json.load(file_) + elif tag_map is True: + tag_map = None if lex_attr_getters is not None \ and oov_prob is True \ and (path / 'vocab' / 'oov_prob').exists(): From bdcecb3c96cef5b663b1ada22efa952b0882f1f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:51:31 +0100 Subject: [PATCH 24/31] Add import in regression test --- spacy/tests/regression/test_issue600.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue600.py b/spacy/tests/regression/test_issue600.py index 2ab8e9c25..90e700aed 100644 --- a/spacy/tests/regression/test_issue600.py +++ b/spacy/tests/regression/test_issue600.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from ...tokens import Doc from ...vocab import Vocab +from ...attrs import POS def test_issue600(): From 1b31c05bf891c9b96b412404c78cffd626f05c55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 16:51:40 +0100 Subject: [PATCH 25/31] Whitespace --- spacy/attrs.pxd | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index a878a49d8..073de3565 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -87,5 +87,3 @@ cpdef enum attr_id_t: PROB LANG - - From 2b2ea8ca11d1658325a855861150e2dbd5d709fb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:54:19 +0100 Subject: [PATCH 26/31] Reorganise language data --- spacy/de/__init__.py | 22 +- spacy/de/language_data.py | 19 +- spacy/en/__init__.py | 28 +-- spacy/en/language_data.py | 24 +- spacy/es/__init__.py | 23 +- spacy/es/language_data.py | 407 +------------------------------ spacy/es/stop_words.py | 84 +++++++ spacy/es/tokenizer_exceptions.py | 318 ++++++++++++++++++++++++ spacy/fr/__init__.py | 23 +- spacy/fr/language_data.py | 109 +-------- spacy/fr/stop_words.py | 88 +++++++ spacy/it/__init__.py | 23 +- spacy/it/language_data.py | 106 +------- spacy/it/stop_words.py | 85 +++++++ spacy/language_data/__init__.py | 1 + spacy/nl/__init__.py | 25 +- spacy/nl/language_data.py | 83 +------ spacy/nl/stop_words.py | 43 ++++ spacy/pt/__init__.py | 23 +- spacy/pt/language_data.py | 87 +------ spacy/pt/stop_words.py | 66 +++++ 21 files changed, 760 insertions(+), 927 deletions(-) create mode 100644 spacy/es/stop_words.py create mode 100644 spacy/es/tokenizer_exceptions.py create mode 100644 spacy/fr/stop_words.py create mode 100644 spacy/it/stop_words.py create mode 100644 spacy/nl/stop_words.py create mode 100644 spacy/pt/stop_words.py diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index 2e7cba4b2..3143a5cd4 100644 --- a/spacy/de/__init__.py +++ 
b/spacy/de/__init__.py @@ -5,25 +5,8 @@ from os import path from ..language import Language from ..attrs import LANG -from . import language_data -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class German(Language): @@ -35,8 +18,5 @@ class German(Language): lex_attr_getters[LANG] = lambda text: 'de' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py index 4fefdb6a6..f64c915f6 100644 --- a/spacy/de/language_data.py +++ b/spacy/de/language_data.py @@ -1,12 +1,21 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc from .tag_map import TAG_MAP from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY + + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS"] diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 623585f7d..2ac839120 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -4,35 +4,12 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . import language_data -from .. 
import util from ..lemmatizer import Lemmatizer from ..vocab import Vocab from ..tokenizer import Tokenizer from ..attrs import LANG -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import expand_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY -from .language_data import get_time_exc - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) -update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) -update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) - +from .language_data import * class English(Language): lang = 'en' @@ -42,8 +19,5 @@ class English(Language): lex_attr_getters[LANG] = lambda text: 'en' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index 1b0ba47df..0bfbe13b7 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -1,13 +1,9 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import ENT_ID -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES -from ..language_data import ENTITY_RULES, FALSE_POSITIVES +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc, expand_exc +from ..symbols import ORTH, LEMMA from .tag_map import TAG_MAP from .stop_words import STOP_WORDS @@ -39,3 +35,17 @@ def get_time_exc(hours): {ORTH: "pm", LEMMA: "p.m."} ] return exc + + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1))) +update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + + +__all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"] diff --git a/spacy/es/__init__.py b/spacy/es/__init__.py index f6d54714a..216a60d15 100644 --- a/spacy/es/__init__.py +++ b/spacy/es/__init__.py @@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . 
import language_data from ..attrs import LANG -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class Spanish(Language): @@ -34,8 +17,4 @@ class Spanish(Language): lex_attr_getters[LANG] = lambda text: 'es' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py index 344adf59b..90595be82 100644 --- a/spacy/es/language_data.py +++ b/spacy/es/language_data.py @@ -1,408 +1,19 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY -TAG_MAP = { -} +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +STOP_WORDS = set(STOP_WORDS) -STOP_WORDS = set(""" -actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí -al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos -antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas -aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo -aseguró asi así atras aun aunque ayer añadió aún +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) -bajo bastante bien breve buen buena buenas bueno buenos -cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con -conmigo conocer conseguimos conseguir considera consideró consigo consigue -consiguen consigues contigo contra cosas creo cual cuales cualquier cuando -cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas -cuánto cuántos cómo - -da dado dan dar de debajo debe deben debido decir dejó del delante demasiado -demás dentro deprisa desde despacio despues después detras detrás dia dias dice -dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día -días dónde - -ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas -empleo en encima encuentra enfrente enseguida entonces entre era eramos eran -eras eres es esa esas ese eso esos esta estaba estaban estado estados estais -estamos estan estar estará estas este esto estos estoy estuvo está están ex -excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste -éstos - -fin final fue fuera fueron fui fuimos - -general gran grandes gueno - -ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer -hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron -hizo horas hoy hubo - -igual 
incluso indicó informo informó intenta intentais intentamos intentan -intentar intentas intento ir - -junto - -la lado largo las le lejos les llegó lleva llevar lo los luego lugar - -mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi -mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha -muchas mucho muchos muy más mí mía mías mío míos - -nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros -nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca - -ocho os otra otras otro otros - -pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas -poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá -podrán podría podrían poner por porque posible primer primera primero primeros -principalmente pronto propia propias propio propios proximo próximo próximos -pudo pueda puede pueden puedo pues - -qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué - -raras realizado realizar realizó repente respecto - -sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo -según seis ser sera será serán sería señaló si sido siempre siendo siete sigue -siguiente sin sino sobre sois sola solamente solas solo solos somos son soy -soyos su supuesto sus suya suyas suyo sé sí sólo - -tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis -tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda -todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan -trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo -tuyos tú - -ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes -última últimas último últimos - -va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero -vez vosotras vosotros voy vuestra vuestras vuestro vuestros - -ya yo -""".split()) - - -TOKENIZER_EXCEPTIONS = { - "accidentarse": [ - {ORTH: "accidentar", LEMMA: "accidentar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "aceptarlo": [ - {ORTH: "aceptar", LEMMA: "aceptar", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "acompañarla": [ - {ORTH: "acompañar", LEMMA: "acompañar", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "advertirle": [ - {ORTH: "advertir", LEMMA: "advertir", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "al": [ - {ORTH: "a", LEMMA: "a", POS: ADP}, - {ORTH: "el", LEMMA: "el", POS: DET} - ], - - "anunciarnos": [ - {ORTH: "anunciar", LEMMA: "anunciar", POS: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "asegurándole": [ - {ORTH: "asegurando", LEMMA: "asegurar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "considerarle": [ - {ORTH: "considerar", LEMMA: "considerar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "decirle": [ - {ORTH: "decir", LEMMA: "decir", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "decirles": [ - {ORTH: "decir", LEMMA: "decir", POS: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "decirte": [ - {ORTH: "Decir", LEMMA: "decir", POS: AUX}, - {ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "dejarla": [ - {ORTH: "dejar", LEMMA: "dejar", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "dejarnos": [ - {ORTH: "dejar", LEMMA: "dejar", POS: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "dejándole": [ - {ORTH: "dejando", 
LEMMA: "dejar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "del": [ - {ORTH: "de", LEMMA: "de", POS: ADP}, - {ORTH: "el", LEMMA: "el", POS: DET} - ], - - "demostrarles": [ - {ORTH: "demostrar", LEMMA: "demostrar", POS: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "diciéndole": [ - {ORTH: "diciendo", LEMMA: "decir", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "diciéndoles": [ - {ORTH: "diciendo", LEMMA: "decir", POS: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "diferenciarse": [ - {ORTH: "diferenciar", LEMMA: "diferenciar", POS: AUX}, - {ORTH: "se", LEMMA: "él", POS: PRON} - ], - - "divirtiéndome": [ - {ORTH: "divirtiendo", LEMMA: "divertir", POS: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "ensanchándose": [ - {ORTH: "ensanchando", LEMMA: "ensanchar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "explicarles": [ - {ORTH: "explicar", LEMMA: "explicar", POS: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberla": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberlas": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberlo": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberlos": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberme": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "haberse": [ - {ORTH: "haber", LEMMA: "haber", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "hacerle": [ - {ORTH: "hacer", LEMMA: "hacer", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "hacerles": [ - {ORTH: "hacer", LEMMA: "hacer", POS: AUX}, - {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "hallarse": [ - {ORTH: "hallar", LEMMA: "hallar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "imaginaros": [ - {ORTH: "imaginar", LEMMA: "imaginar", POS: AUX}, - {ORTH: "os", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "insinuarle": [ - {ORTH: "insinuar", LEMMA: "insinuar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "justificarla": [ - {ORTH: "justificar", LEMMA: "justificar", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "mantenerlas": [ - {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, - {ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "mantenerlos": [ - {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "mantenerme": [ - {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, - {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "pasarte": [ - {ORTH: "pasar", LEMMA: "pasar", POS: AUX}, - {ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "pedirle": [ - {ORTH: "pedir", LEMMA: "pedir", POS: AUX}, - {ORTH: "le", LEMMA: "él", POS: PRON} - ], - - "pel": [ - {ORTH: "per", LEMMA: "per", POS: ADP}, - {ORTH: "el", LEMMA: "el", POS: DET} - ], - - "pidiéndonos": [ - {ORTH: "pidiendo", LEMMA: "pedir", POS: AUX}, - {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "poderle": [ - {ORTH: "poder", LEMMA: "poder", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "preguntarse": [ - {ORTH: "preguntar", LEMMA: "preguntar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "preguntándose": [ - {ORTH: "preguntando", LEMMA: "preguntar", POS: AUX}, - {ORTH: "se", 
LEMMA: PRON_LEMMA, POS: PRON} - ], - - "presentarla": [ - {ORTH: "presentar", LEMMA: "presentar", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "pudiéndolo": [ - {ORTH: "pudiendo", LEMMA: "poder", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "pudiéndose": [ - {ORTH: "pudiendo", LEMMA: "poder", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "quererle": [ - {ORTH: "querer", LEMMA: "querer", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "rasgarse": [ - {ORTH: "Rasgar", LEMMA: "rasgar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "repetirlo": [ - {ORTH: "repetir", LEMMA: "repetir", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "robarle": [ - {ORTH: "robar", LEMMA: "robar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "seguirlos": [ - {ORTH: "seguir", LEMMA: "seguir", POS: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "serle": [ - {ORTH: "ser", LEMMA: "ser", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "serlo": [ - {ORTH: "ser", LEMMA: "ser", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "señalándole": [ - {ORTH: "señalando", LEMMA: "señalar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "suplicarle": [ - {ORTH: "suplicar", LEMMA: "suplicar", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "tenerlos": [ - {ORTH: "tener", LEMMA: "tener", POS: AUX}, - {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "vengarse": [ - {ORTH: "vengar", LEMMA: "vengar", POS: AUX}, - {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "verla": [ - {ORTH: "ver", LEMMA: "ver", POS: AUX}, - {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "verle": [ - {ORTH: "ver", LEMMA: "ver", POS: AUX}, - {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} - ], - - "volverlo": [ - {ORTH: "volver", LEMMA: "volver", POS: AUX}, - {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} - ] -} - - -ORTH_ONLY = [ - -] +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/es/stop_words.py b/spacy/es/stop_words.py new file mode 100644 index 000000000..8dc4f11e0 --- /dev/null +++ b/spacy/es/stop_words.py @@ -0,0 +1,84 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +actualmente acuerdo adelante ademas además adrede afirmó agregó ahi ahora ahí +al algo alguna algunas alguno algunos algún alli allí alrededor ambos ampleamos +antano antaño ante anterior antes apenas aproximadamente aquel aquella aquellas +aquello aquellos aqui aquél aquélla aquéllas aquéllos aquí arriba arribaabajo +aseguró asi así atras aun aunque ayer añadió aún + +bajo bastante bien breve buen buena buenas bueno buenos + +cada casi cerca cierta ciertas cierto ciertos cinco claro comentó como con +conmigo conocer conseguimos conseguir considera consideró consigo consigue +consiguen consigues contigo contra cosas creo cual cuales cualquier cuando +cuanta cuantas cuanto cuantos cuatro cuenta cuál cuáles cuándo cuánta cuántas +cuánto cuántos cómo + +da dado dan dar de debajo debe deben debido decir dejó del delante demasiado +demás dentro deprisa desde despacio despues después detras detrás dia dias dice +dicen dicho dieron diferente diferentes dijeron dijo dio donde dos durante día +días dónde + +ejemplo el ella ellas ello ellos embargo empleais emplean emplear empleas +empleo en encima encuentra enfrente enseguida entonces entre era eramos eran +eras eres es esa esas ese eso esos esta estaba estaban estado estados 
estais +estamos estan estar estará estas este esto estos estoy estuvo está están ex +excepto existe existen explicó expresó él ésa ésas ése ésos ésta éstas éste +éstos + +fin final fue fuera fueron fui fuimos + +general gran grandes gueno + +ha haber habia habla hablan habrá había habían hace haceis hacemos hacen hacer +hacerlo haces hacia haciendo hago han hasta hay haya he hecho hemos hicieron +hizo horas hoy hubo + +igual incluso indicó informo informó intenta intentais intentamos intentan +intentar intentas intento ir + +junto + +la lado largo las le lejos les llegó lleva llevar lo los luego lugar + +mal manera manifestó mas mayor me mediante medio mejor mencionó menos menudo mi +mia mias mientras mio mios mis misma mismas mismo mismos modo momento mucha +muchas mucho muchos muy más mí mía mías mío míos + +nada nadie ni ninguna ningunas ninguno ningunos ningún no nos nosotras nosotros +nuestra nuestras nuestro nuestros nueva nuevas nuevo nuevos nunca + +ocho os otra otras otro otros + +pais para parece parte partir pasada pasado paìs peor pero pesar poca pocas +poco pocos podeis podemos poder podria podriais podriamos podrian podrias podrá +podrán podría podrían poner por porque posible primer primera primero primeros +principalmente pronto propia propias propio propios proximo próximo próximos +pudo pueda puede pueden puedo pues + +qeu que quedó queremos quien quienes quiere quiza quizas quizá quizás quién quiénes qué + +raras realizado realizar realizó repente respecto + +sabe sabeis sabemos saben saber sabes salvo se sea sean segun segunda segundo +según seis ser sera será serán sería señaló si sido siempre siendo siete sigue +siguiente sin sino sobre sois sola solamente solas solo solos somos son soy +soyos su supuesto sus suya suyas suyo sé sí sólo + +tal tambien también tampoco tan tanto tarde te temprano tendrá tendrán teneis +tenemos tener tenga tengo tenido tenía tercera ti tiempo tiene tienen toda +todas todavia todavía todo todos total trabaja trabajais trabajamos trabajan +trabajar trabajas trabajo tras trata través tres tu tus tuvo tuya tuyas tuyo +tuyos tú + +ultimo un una unas uno unos usa usais usamos usan usar usas uso usted ustedes +última últimas último últimos + +va vais valor vamos van varias varios vaya veces ver verdad verdadera verdadero +vez vosotras vosotros voy vuestra vuestras vuestro vuestros + +ya yo +""".split()) diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py new file mode 100644 index 000000000..36a2a8d23 --- /dev/null +++ b/spacy/es/tokenizer_exceptions.py @@ -0,0 +1,318 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "accidentarse": [ + {ORTH: "accidentar", LEMMA: "accidentar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "aceptarlo": [ + {ORTH: "aceptar", LEMMA: "aceptar", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "acompañarla": [ + {ORTH: "acompañar", LEMMA: "acompañar", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "advertirle": [ + {ORTH: "advertir", LEMMA: "advertir", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "al": [ + {ORTH: "a", LEMMA: "a", POS: ADP}, + {ORTH: "el", LEMMA: "el", POS: DET} + ], + + "anunciarnos": [ + {ORTH: "anunciar", LEMMA: "anunciar", POS: AUX}, + {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "asegurándole": [ + {ORTH: "asegurando", LEMMA: "asegurar", POS: AUX}, + {ORTH: "le", LEMMA: 
PRON_LEMMA, POS: PRON} + ], + + "considerarle": [ + {ORTH: "considerar", LEMMA: "considerar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "decirle": [ + {ORTH: "decir", LEMMA: "decir", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "decirles": [ + {ORTH: "decir", LEMMA: "decir", POS: AUX}, + {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "decirte": [ + {ORTH: "Decir", LEMMA: "decir", POS: AUX}, + {ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "dejarla": [ + {ORTH: "dejar", LEMMA: "dejar", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "dejarnos": [ + {ORTH: "dejar", LEMMA: "dejar", POS: AUX}, + {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "dejándole": [ + {ORTH: "dejando", LEMMA: "dejar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "del": [ + {ORTH: "de", LEMMA: "de", POS: ADP}, + {ORTH: "el", LEMMA: "el", POS: DET} + ], + + "demostrarles": [ + {ORTH: "demostrar", LEMMA: "demostrar", POS: AUX}, + {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "diciéndole": [ + {ORTH: "diciendo", LEMMA: "decir", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "diciéndoles": [ + {ORTH: "diciendo", LEMMA: "decir", POS: AUX}, + {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "diferenciarse": [ + {ORTH: "diferenciar", LEMMA: "diferenciar", POS: AUX}, + {ORTH: "se", LEMMA: "él", POS: PRON} + ], + + "divirtiéndome": [ + {ORTH: "divirtiendo", LEMMA: "divertir", POS: AUX}, + {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "ensanchándose": [ + {ORTH: "ensanchando", LEMMA: "ensanchar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "explicarles": [ + {ORTH: "explicar", LEMMA: "explicar", POS: AUX}, + {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberla": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberlas": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberlo": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberlos": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberme": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "haberse": [ + {ORTH: "haber", LEMMA: "haber", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "hacerle": [ + {ORTH: "hacer", LEMMA: "hacer", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "hacerles": [ + {ORTH: "hacer", LEMMA: "hacer", POS: AUX}, + {ORTH: "les", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "hallarse": [ + {ORTH: "hallar", LEMMA: "hallar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "imaginaros": [ + {ORTH: "imaginar", LEMMA: "imaginar", POS: AUX}, + {ORTH: "os", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "insinuarle": [ + {ORTH: "insinuar", LEMMA: "insinuar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "justificarla": [ + {ORTH: "justificar", LEMMA: "justificar", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "mantenerlas": [ + {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, + {ORTH: "las", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "mantenerlos": [ + {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, + {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "mantenerme": [ + {ORTH: "mantener", LEMMA: "mantener", POS: AUX}, + {ORTH: "me", LEMMA: PRON_LEMMA, POS: PRON} + ], + + 
"pasarte": [ + {ORTH: "pasar", LEMMA: "pasar", POS: AUX}, + {ORTH: "te", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "pedirle": [ + {ORTH: "pedir", LEMMA: "pedir", POS: AUX}, + {ORTH: "le", LEMMA: "él", POS: PRON} + ], + + "pel": [ + {ORTH: "per", LEMMA: "per", POS: ADP}, + {ORTH: "el", LEMMA: "el", POS: DET} + ], + + "pidiéndonos": [ + {ORTH: "pidiendo", LEMMA: "pedir", POS: AUX}, + {ORTH: "nos", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "poderle": [ + {ORTH: "poder", LEMMA: "poder", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "preguntarse": [ + {ORTH: "preguntar", LEMMA: "preguntar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "preguntándose": [ + {ORTH: "preguntando", LEMMA: "preguntar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "presentarla": [ + {ORTH: "presentar", LEMMA: "presentar", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "pudiéndolo": [ + {ORTH: "pudiendo", LEMMA: "poder", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "pudiéndose": [ + {ORTH: "pudiendo", LEMMA: "poder", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "quererle": [ + {ORTH: "querer", LEMMA: "querer", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "rasgarse": [ + {ORTH: "Rasgar", LEMMA: "rasgar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "repetirlo": [ + {ORTH: "repetir", LEMMA: "repetir", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "robarle": [ + {ORTH: "robar", LEMMA: "robar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "seguirlos": [ + {ORTH: "seguir", LEMMA: "seguir", POS: AUX}, + {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "serle": [ + {ORTH: "ser", LEMMA: "ser", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "serlo": [ + {ORTH: "ser", LEMMA: "ser", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "señalándole": [ + {ORTH: "señalando", LEMMA: "señalar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "suplicarle": [ + {ORTH: "suplicar", LEMMA: "suplicar", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "tenerlos": [ + {ORTH: "tener", LEMMA: "tener", POS: AUX}, + {ORTH: "los", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "vengarse": [ + {ORTH: "vengar", LEMMA: "vengar", POS: AUX}, + {ORTH: "se", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "verla": [ + {ORTH: "ver", LEMMA: "ver", POS: AUX}, + {ORTH: "la", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "verle": [ + {ORTH: "ver", LEMMA: "ver", POS: AUX}, + {ORTH: "le", LEMMA: PRON_LEMMA, POS: PRON} + ], + + "volverlo": [ + {ORTH: "volver", LEMMA: "volver", POS: AUX}, + {ORTH: "lo", LEMMA: PRON_LEMMA, POS: PRON} + ] +} + + +ORTH_ONLY = [ + +] diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py index 9f259b1b5..81584b926 100644 --- a/spacy/fr/__init__.py +++ b/spacy/fr/__init__.py @@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . 
import language_data from ..attrs import LANG -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class French(Language): @@ -34,8 +17,4 @@ class French(Language): lex_attr_getters[LANG] = lambda text: 'fr' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index b35ecfd71..e612fe064 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -1,109 +1,14 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. import language_data as base +from ..language_data import strings_to_exc + +from .stop_words import STOP_WORDS -TAG_MAP = { - -} +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +STOP_WORDS = set(STOP_WORDS) -STOP_WORDS = set(""" -a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons -allô alors anterieur anterieure anterieures apres après as assez attendu au -aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront -aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient -avais avait avant avec avoir avons ayant - -bah bas basee bat beau beaucoup bien bigre boum bravo brrr - -ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui -celui-ci celui-là cent cependant certain certaine certaines certains certes ces -cet cette ceux ceux-ci ceux-là chacun chacune chaque cher chers chez chiche -chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac -clic combien comme comment comparable comparables compris concernant contre -couic crac - -da dans de debout dedans dehors deja delà depuis dernier derniere derriere -derrière des desormais desquelles desquels dessous dessus deux deuxième -deuxièmement devant devers devra different differentes differents différent -différente différentes différents dire directe directement dit dite dits divers -diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont -douze douzième dring du duquel durant dès désormais - -effet egale egalement egales eh elle elle-même elles elles-mêmes en encore -enfin entre envers environ es ès est et etaient étaient etais étais etait était -etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso -exterieur - -fais faisaient faisant fait façon feront fi flac floc font - -gens - -ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum -hurrah hé hélas i il ils importe - -je jusqu jusque juste - -la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps -lors lorsque lui lui-meme lui-même là lès - -ma maint maintenant 
mais malgre malgré maximale me meme memes merci mes mien -mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins -mon moyennant multiple multiples même mêmes - -na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf -neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau -nul néanmoins nôtre nôtres - -o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre -ouvert ouverte ouverts où - -paf pan par parce parfois parle parlent parler parmi parseme partant -particulier particulière particulièrement pas passé pendant pense permet -personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus -plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi -pourrais pourrait pouvait prealable precisement premier première premièrement -pres probable probante procedant proche près psitt pu puis puisque pur pure - -qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt -quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque -quelques quels qui quiconque quinze quoi quoique - -rare rarement rares relative relativement remarquable rend rendre restant reste -restent restrictif retour revoici revoilà rien - -sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient -semble semblent sent sept septième sera seraient serait seront ses seul seule -seulement si sien sienne siennes siens sinon six sixième soi soi-même soit -soixante son sont sous souvent specifique specifiques speculatif stop -strictement subtiles suffisant suffisante suffit suis suit suivant suivante -suivantes suivants suivre superpose sur surtout - -ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente -tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous -tout toute toutefois toutes treize trente tres trois troisième troisièmement -trop très tsoin tsouin tu té - -un une unes uniformement unique uniques uns - -va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos -votre vous vous-mêmes vu vé vôtre vôtres - -zut -""".split()) - - -TOKENIZER_EXCEPTIONS = { - -} - - -ORTH_ONLY = { - -} +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/fr/stop_words.py b/spacy/fr/stop_words.py new file mode 100644 index 000000000..52e4f6f0c --- /dev/null +++ b/spacy/fr/stop_words.py @@ -0,0 +1,88 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a à â abord absolument afin ah ai aie ailleurs ainsi ait allaient allo allons +allô alors anterieur anterieure anterieures apres après as assez attendu au +aucun aucune aujourd aujourd'hui aupres auquel aura auraient aurait auront +aussi autre autrefois autrement autres autrui aux auxquelles auxquels avaient +avais avait avant avec avoir avons ayant + +bah bas basee bat beau beaucoup bien bigre boum bravo brrr + +ça car ce ceci cela celle celle-ci celle-là celles celles-ci celles-là celui +celui-ci celui-là cent cependant certain certaine certaines certains certes ces +cet cette ceux ceux-ci ceux-là chacun chacune chaque cher chers chez chiche +chut chère chères ci cinq cinquantaine cinquante cinquantième cinquième clac +clic combien comme comment comparable comparables compris concernant contre +couic crac + +da dans de debout dedans dehors deja delà depuis dernier derniere derriere +derrière des desormais desquelles desquels dessous dessus deux deuxième +deuxièmement devant devers devra different differentes differents 
différent +différente différentes différents dire directe directement dit dite dits divers +diverse diverses dix dix-huit dix-neuf dix-sept dixième doit doivent donc dont +douze douzième dring du duquel durant dès désormais + +effet egale egalement egales eh elle elle-même elles elles-mêmes en encore +enfin entre envers environ es ès est et etaient étaient etais étais etait était +etant étant etc été etre être eu euh eux eux-mêmes exactement excepté extenso +exterieur + +fais faisaient faisant fait façon feront fi flac floc font + +gens + +ha hein hem hep hi ho holà hop hormis hors hou houp hue hui huit huitième hum +hurrah hé hélas i il ils importe + +je jusqu jusque juste + +la laisser laquelle las le lequel les lesquelles lesquels leur leurs longtemps +lors lorsque lui lui-meme lui-même là lès + +ma maint maintenant mais malgre malgré maximale me meme memes merci mes mien +mienne miennes miens mille mince minimale moi moi-meme moi-même moindres moins +mon moyennant multiple multiples même mêmes + +na naturel naturelle naturelles ne neanmoins necessaire necessairement neuf +neuvième ni nombreuses nombreux non nos notamment notre nous nous-mêmes nouveau +nul néanmoins nôtre nôtres + +o ô oh ohé ollé olé on ont onze onzième ore ou ouf ouias oust ouste outre +ouvert ouverte ouverts où + +paf pan par parce parfois parle parlent parler parmi parseme partant +particulier particulière particulièrement pas passé pendant pense permet +personne peu peut peuvent peux pff pfft pfut pif pire plein plouf plus +plusieurs plutôt possessif possessifs possible possibles pouah pour pourquoi +pourrais pourrait pouvait prealable precisement premier première premièrement +pres probable probante procedant proche près psitt pu puis puisque pur pure + +qu quand quant quant-à-soi quanta quarante quatorze quatre quatre-vingt +quatrième quatrièmement que quel quelconque quelle quelles quelqu'un quelque +quelques quels qui quiconque quinze quoi quoique + +rare rarement rares relative relativement remarquable rend rendre restant reste +restent restrictif retour revoici revoilà rien + +sa sacrebleu sait sans sapristi sauf se sein seize selon semblable semblaient +semble semblent sent sept septième sera seraient serait seront ses seul seule +seulement si sien sienne siennes siens sinon six sixième soi soi-même soit +soixante son sont sous souvent specifique specifiques speculatif stop +strictement subtiles suffisant suffisante suffit suis suit suivant suivante +suivantes suivants suivre superpose sur surtout + +ta tac tant tardive te tel telle tellement telles tels tenant tend tenir tente +tes tic tien tienne tiennes tiens toc toi toi-même ton touchant toujours tous +tout toute toutefois toutes treize trente tres trois troisième troisièmement +trop très tsoin tsouin tu té + +un une unes uniformement unique uniques uns + +va vais vas vers via vif vifs vingt vivat vive vives vlan voici voilà vont vos +votre vous vous-mêmes vu vé vôtre vôtres + +zut +""".split()) diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py index cc3d9143f..2ef60fd94 100644 --- a/spacy/it/__init__.py +++ b/spacy/it/__init__.py @@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . 
import language_data from ..attrs import LANG -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class Italian(Language): @@ -34,8 +17,4 @@ class Italian(Language): lex_attr_getters[LANG] = lambda text: 'it' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py index d47be449b..8683f83ac 100644 --- a/spacy/it/language_data.py +++ b/spacy/it/language_data.py @@ -1,106 +1,14 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS -TAG_MAP = { - -} +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +STOP_WORDS = set(STOP_WORDS) -STOP_WORDS = set(""" -a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl -agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri -altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai -attesa attraverso avanti avemmo avendo avente aver avere averlo avesse -avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate -avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste -avresti avrete avrà avrò avuta avute avuti avuto - -basta bene benissimo brava bravo - -casa caso cento certa certe certi certo che chi chicchessia chiunque ci -ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto -cogli coi col colei coll coloro colui come cominci comunque con concernente -conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui - -da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli -dei del dell della delle dello dentro detto deve di dice dietro dire -dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due -dunque durante - -ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era -erano eravamo eravate eri ero esempio esse essendo esser essere essi ex - -fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero -facessi facessimo faceste facesti faceva facevamo facevano facevate facevi -facevo fai fanno farai faranno fare farebbe farebbero farei faremmo faremo -fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente -finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra -frattempo fu fui fummo fuori furono futuro generale - -gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo -grande grazie gruppo - -ha haha hai hanno ho - -ieri il improvviso in inc infatti inoltre insieme 
intanto intorno invece io - -la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo - -ma macche magari maggior mai male malgrado malissimo mancanza marche me -medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi -milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto - -nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun -nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre -nostri nostro novanta nove nulla nuovo - -od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto - -paese parecchi parecchie parecchio parte partendo peccato peggio per perche -perché percio perciò perfino pero persino persone però piedi pieno piglia piu -piuttosto più po pochissimo poco poi poiche possa possedere posteriore posto -potrebbe preferibilmente presa press prima primo principalmente probabilmente -proprio puo può pure purtroppo - -qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante -quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest -questa queste questi questo qui quindi - -realmente recente recentemente registrazione relativo riecco salvo - -sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste -saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei -sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate -siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando -stanno starai staranno starebbe starebbero starei staremmo staremo stareste -staresti starete starà starò stata state stati stato stava stavamo stavano -stavate stavi stavo stemmo stessa stesse stessero stessi stessimo stesso -steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua -subito successivamente successivo sue sugl sugli sui sul sull sulla sulle -sullo suo suoi - -tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta -troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto - -uguali ulteriore ultimo un una uno uomo - -va vale vari varia varie vario verso vi via vicino visto vita voi volta volte -vostra vostre vostri vostro -""".split()) - - -TOKENIZER_EXCEPTIONS = { - -} - - -ORTH_ONLY = { - -} +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/it/stop_words.py b/spacy/it/stop_words.py new file mode 100644 index 000000000..cf5697514 --- /dev/null +++ b/spacy/it/stop_words.py @@ -0,0 +1,85 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a abbastanza abbia abbiamo abbiano abbiate accidenti ad adesso affinche agl +agli ahime ahimè ai al alcuna alcuni alcuno all alla alle allo allora altri +altrimenti altro altrove altrui anche ancora anni anno ansa anticipo assai +attesa attraverso avanti avemmo avendo avente aver avere averlo avesse +avessero avessi avessimo aveste avesti avete aveva avevamo avevano avevate +avevi avevo avrai avranno avrebbe avrebbero avrei avremmo avremo avreste +avresti avrete avrà avrò avuta avute avuti avuto + +basta bene benissimo brava bravo + +casa caso cento certa certe certi certo che chi chicchessia chiunque ci +ciascuna ciascuno cima cio cioe circa citta città co codesta codesti codesto +cogli coi col colei coll coloro colui come cominci comunque con concernente +conciliarsi conclusione consiglio contro cortesia cos cosa cosi così cui + +da dagl dagli dai dal dall dalla dalle dallo dappertutto davanti degl degli +dei del dell della delle dello dentro 
detto deve di dice dietro dire +dirimpetto diventa diventare diventato dopo dov dove dovra dovrà dovunque due +dunque durante + +ebbe ebbero ebbi ecc ecco ed effettivamente egli ella entrambi eppure era +erano eravamo eravate eri ero esempio esse essendo esser essere essi ex + +fa faccia facciamo facciano facciate faccio facemmo facendo facesse facessero +facessi facessimo faceste facesti faceva facevamo facevano facevate facevi +facevo fai fanno farai faranno fare farebbe farebbero farei faremmo faremo +fareste faresti farete farà farò fatto favore fece fecero feci fin finalmente +finche fine fino forse forza fosse fossero fossi fossimo foste fosti fra +frattempo fu fui fummo fuori furono futuro generale + +gia già giacche giorni giorno gli gliela gliele glieli glielo gliene governo +grande grazie gruppo + +ha haha hai hanno ho + +ieri il improvviso in inc infatti inoltre insieme intanto intorno invece io + +la là lasciato lato lavoro le lei li lo lontano loro lui lungo luogo + +ma macche magari maggior mai male malgrado malissimo mancanza marche me +medesimo mediante meglio meno mentre mesi mezzo mi mia mie miei mila miliardi +milioni minimi ministro mio modo molti moltissimo molto momento mondo mosto + +nazionale ne negl negli nei nel nell nella nelle nello nemmeno neppure nessun +nessuna nessuno niente no noi non nondimeno nonostante nonsia nostra nostre +nostri nostro novanta nove nulla nuovo + +od oggi ogni ognuna ognuno oltre oppure ora ore osi ossia ottanta otto + +paese parecchi parecchie parecchio parte partendo peccato peggio per perche +perché percio perciò perfino pero persino persone però piedi pieno piglia piu +piuttosto più po pochissimo poco poi poiche possa possedere posteriore posto +potrebbe preferibilmente presa press prima primo principalmente probabilmente +proprio puo può pure purtroppo + +qualche qualcosa qualcuna qualcuno quale quali qualunque quando quanta quante +quanti quanto quantunque quasi quattro quel quella quelle quelli quello quest +questa queste questi questo qui quindi + +realmente recente recentemente registrazione relativo riecco salvo + +sara sarà sarai saranno sarebbe sarebbero sarei saremmo saremo sareste +saresti sarete saro sarò scola scopo scorso se secondo seguente seguito sei +sembra sembrare sembrato sembri sempre senza sette si sia siamo siano siate +siete sig solito solo soltanto sono sopra sotto spesso srl sta stai stando +stanno starai staranno starebbe starebbero starei staremmo staremo stareste +staresti starete starà starò stata state stati stato stava stavamo stavano +stavate stavi stavo stemmo stessa stesse stessero stessi stessimo stesso +steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua +subito successivamente successivo sue sugl sugli sui sul sull sulla sulle +sullo suo suoi + +tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta +troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto + +uguali ulteriore ultimo un una uno uomo + +va vale vari varia varie vario verso vi via vicino visto vita voi volta volte +vostra vostre vostri vostro +""".split()) diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index 3aba785c2..f6aa4317c 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,4 +1,5 @@ from .emoticons import * from .punctuation import * +from .tag_map import * from .entity_rules import * from .util import * diff --git a/spacy/nl/__init__.py b/spacy/nl/__init__.py index 9c6d4af7d..d958783ea 100644 --- 
a/spacy/nl/__init__.py +++ b/spacy/nl/__init__.py @@ -4,39 +4,16 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . import language_data from ..attrs import LANG - -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class Dutch(Language): lang = 'nl' class Defaults(Language.Defaults): - tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nl' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index 7fa3b247d..8683f83ac 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -1,83 +1,14 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS -# TODO insert TAG_MAP for Dutch - -TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} -} +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +STOP_WORDS = set(STOP_WORDS) -# Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt - -STOP_WORDS = set(""" -aan af al alles als altijd andere - -ben bij - -daar dan dat de der deze die dit doch doen door dus - -een eens en er - -ge geen geweest - -haar had heb hebben heeft hem het hier hij hoe hun - -iemand iets ik in is - -ja je - -kan kon kunnen - -maar me meer men met mij mijn moet - -na naar niet niets nog nu - -of om omdat ons ook op over - -reeds - -te tegen toch toen tot - -u uit uw - -van veel voor - -want waren was wat we wel werd wezen wie wij wil worden - -zal ze zei zelf zich zij zijn zo zonder zou -""".split()) - - -# TODO Make tokenizer excpetions for Dutch - -TOKENIZER_EXCEPTIONS = { - -} - - -ORTH_ONLY = { - -} +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/nl/stop_words.py b/spacy/nl/stop_words.py new file mode 100644 index 000000000..bef6871b2 --- /dev/null +++ b/spacy/nl/stop_words.py @@ -0,0 +1,43 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Stop words are retrieved from http://www.damienvanholten.com/downloads/dutch-stop-words.txt + +STOP_WORDS = set(""" +aan af al alles als altijd 
andere + +ben bij + +daar dan dat de der deze die dit doch doen door dus + +een eens en er + +ge geen geweest + +haar had heb hebben heeft hem het hier hij hoe hun + +iemand iets ik in is + +ja je + +kan kon kunnen + +maar me meer men met mij mijn moet + +na naar niet niets nog nu + +of om omdat ons ook op over + +reeds + +te tegen toch toen tot + +u uit uw + +van veel voor + +want waren was wat we wel werd wezen wie wij wil worden + +zal ze zei zelf zich zij zijn zo zonder zou +""".split()) diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py index 5381b1926..06c6417dc 100644 --- a/spacy/pt/__init__.py +++ b/spacy/pt/__init__.py @@ -4,26 +4,9 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from . import language_data from ..attrs import LANG -from ..language_data import update_exc -from ..language_data import strings_to_exc -from ..language_data import EMOTICONS - -from .language_data import ORTH_ONLY - - -TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) -TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) -TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) -TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) -TAG_MAP = dict(language_data.TAG_MAP) -STOP_WORDS = set(language_data.STOP_WORDS) - - -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) -update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +from .language_data import * class Portuguese(Language): @@ -34,8 +17,4 @@ class Portuguese(Language): lex_attr_getters[LANG] = lambda text: 'pt' tokenizer_exceptions = TOKENIZER_EXCEPTIONS - prefixes = TOKENIZER_PREFIXES - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES - tag_map = TAG_MAP stop_words = STOP_WORDS diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index a7379615c..8683f83ac 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -1,87 +1,14 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import * -from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES +from .. 
import language_data as base +from ..language_data import update_exc, strings_to_exc + +from .stop_words import STOP_WORDS -TAG_MAP = { - -} +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +STOP_WORDS = set(STOP_WORDS) -STOP_WORDS = set(""" -à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano -anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui -aquilo area área as assim através atrás até aí - -baixo bastante bem bom breve - -cada caminho catorze cedo cento certamente certeza cima cinco coisa com como -comprido conhecido conselho contra corrente custa cá - -da daquela daquele dar das de debaixo demais dentro depois desde desligado -dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete -dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida - -é ela elas ele eles em embora enquanto entre então era és essa essas esse esses -esta estado estar estará estas estava este estes esteve estive estivemos -estiveram estiveste estivestes estou está estás estão eu exemplo - -falta fará favor faz fazeis fazem fazemos fazer fazes fazia faço fez fim final -foi fomos for fora foram forma foste fostes fui - -geral grande grandes grupo - -hoje horas há - -iniciar inicio ir irá isso ista iste isto já - -lado ligado local logo longe lugar lá - -maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus -mil minha minhas momento muito muitos máximo mês - -na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome -nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós -número - -obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras -outro outros - -para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem -poder poderá podia ponto pontos por porque porquê posição possivelmente posso -possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam -pôde põe põem - -qual qualquer quando quanto quarta quarto quatro que quem quer quero questão -quieto quinta quinto quinze quê relação - -sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta -sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo - -tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar -tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos -tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu -tua tuas tudo tão têm - -último um uma umas uns usa usar - -vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo -vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós - -zero -""".split()) - - -TOKENIZER_EXCEPTIONS = { - -} - - -ORTH_ONLY = { - -} +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py new file mode 100644 index 000000000..d0008457c --- /dev/null +++ b/spacy/pt/stop_words.py @@ -0,0 +1,66 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano +anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui +aquilo area área as assim através atrás até aí + +baixo bastante bem bom breve + +cada caminho catorze cedo cento certamente certeza cima cinco coisa com como +comprido conhecido conselho contra corrente custa cá + +da daquela daquele dar das de debaixo demais dentro depois desde desligado +dessa 
desse desta deste deve devem deverá dez dezanove dezasseis dezassete +dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida + +é ela elas ele eles em embora enquanto entre então era és essa essas esse esses +esta estado estar estará estas estava este estes esteve estive estivemos +estiveram estiveste estivestes estou está estás estão eu exemplo + +falta fará favor faz fazeis fazem fazemos fazer fazes fazia faço fez fim final +foi fomos for fora foram forma foste fostes fui + +geral grande grandes grupo + +hoje horas há + +iniciar inicio ir irá isso ista iste isto já + +lado ligado local logo longe lugar lá + +maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus +mil minha minhas momento muito muitos máximo mês + +na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome +nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós +número + +obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras +outro outros + +para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem +poder poderá podia ponto pontos por porque porquê posição possivelmente posso +possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam +pôde põe põem + +qual qualquer quando quanto quarta quarto quatro que quem quer quero questão +quieto quinta quinto quinze quê relação + +sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta +sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo + +tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar +tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos +tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu +tua tuas tudo tão têm + +último um uma umas uns usa usar + +vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo +vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós + +zero +""".split()) From 4e95737c6cfb64c2dd4f24f2ab0ba3c030051090 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:54:28 +0100 Subject: [PATCH 27/31] Add base tag map --- spacy/language_data/tag_map.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 spacy/language_data/tag_map.py diff --git a/spacy/language_data/tag_map.py b/spacy/language_data/tag_map.py new file mode 100644 index 000000000..f5b6b5040 --- /dev/null +++ b/spacy/language_data/tag_map.py @@ -0,0 +1,24 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TAG_MAP = { + "ADV": {POS: ADV}, + "NOUN": {POS: NOUN}, + "ADP": {POS: ADP}, + "PRON": {POS: PRON}, + "SCONJ": {POS: SCONJ}, + "PROPN": {POS: PROPN}, + "DET": {POS: DET}, + "SYM": {POS: SYM}, + "INTJ": {POS: INTJ}, + "PUNCT": {POS: PUNCT}, + "NUM": {POS: NUM}, + "AUX": {POS: AUX}, + "X": {POS: X}, + "CONJ": {POS: CONJ}, + "ADJ": {POS: ADJ}, + "VERB": {POS: VERB} +} From bcc1d50d09dcaee958d3e76146aa1987a7c51706 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:54:52 +0100 Subject: [PATCH 28/31] Remove trailing whitespace --- spacy/language.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 78dbac953..a95146eb2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -53,7 +53,7 @@ class BaseDefaults(object): else: return Vocab.load(nlp.path, lex_attr_getters=cls.lex_attr_getters, tag_map=cls.tag_map, 
lemmatizer=lemmatizer) - + @classmethod def add_vectors(cls, nlp=None): if nlp is None or nlp.path is None: @@ -150,9 +150,9 @@ class BaseDefaults(object): tag_map = {} tokenizer_exceptions = {} - + parser_features = get_templates('parser') - + entity_features = get_templates('ner') tagger_features = Tagger.feature_templates # TODO -- fix this @@ -257,7 +257,7 @@ class Language(object): path = util.match_best_version(self.lang, '', util.get_data_path()) self.path = path - + self.vocab = self.Defaults.create_vocab(self) \ if 'vocab' not in overrides \ else overrides['vocab'] @@ -299,7 +299,7 @@ class Language(object): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. - + Args: text (unicode): The text to be processed. @@ -327,9 +327,9 @@ class Language(object): def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000): '''Process texts as a stream, and yield Doc objects in order. - + Supports GIL-free multi-threading. - + Arguments: texts (iterator) tag (bool) @@ -352,7 +352,7 @@ class Language(object): path = self.path elif isinstance(path, basestring): path = pathlib.Path(path) - + if self.tagger: self.tagger.model.end_training() self.tagger.model.dump(str(path / 'pos' / 'model')) @@ -362,7 +362,7 @@ class Language(object): if self.entity: self.entity.model.end_training() self.entity.model.dump(str(path / 'ner' / 'model')) - + strings_loc = path / 'vocab' / 'strings.json' with strings_loc.open('w', encoding='utf8') as file_: self.vocab.strings.dump(file_) From 753068f1d533076da72b9b52c8b65d2883eccd3c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:55:25 +0100 Subject: [PATCH 29/31] Use base language data as default --- spacy/language.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index a95146eb2..222aadf16 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -21,6 +21,7 @@ from .matcher import Matcher from . import attrs from . import orth from . import util +from . import language_data from .lemmatizer import Lemmatizer from .train import Trainer @@ -140,14 +141,14 @@ class BaseDefaults(object): if nlp.entity: pipeline.append(nlp.entity) return pipeline - - prefixes = tuple() - suffixes = tuple() + prefixes = tuple(language_data.TOKENIZER_PREFIXES) - infixes = tuple() - - tag_map = {} + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) + + infixes = tuple(language_data.TOKENIZER_INFIXES) + + tag_map = dict(language_data.TAG_MAP) tokenizer_exceptions = {} From d1c1d3f9cdabc33df93e534da7e55d2e9aaf159e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:55:32 +0100 Subject: [PATCH 30/31] Fix tokenizer test --- spacy/tests/tokenizer/test_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index d4330e3ce..091561ae3 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -8,9 +8,9 @@ import cloudpickle import tempfile from ... 
import util -from ...en.language_data import TOKENIZER_PREFIXES as EN_TOKENIZER_PREFIXES +from ...language_data import TOKENIZER_PREFIXES -en_search_prefixes = util.compile_prefix_regex(EN_TOKENIZER_PREFIXES).search +en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search # @pytest.mark.xfail # def test_pickle(en_tokenizer): From b99d683a931d4c1659a9bb0194c64640ca39b776 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 16:58:28 +0100 Subject: [PATCH 31/31] Fix formatting --- spacy/en/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index b19e49a36..24506c145 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -11,6 +11,7 @@ from ..attrs import LANG from .language_data import * + class English(Language): lang = 'en'
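After this series, a language subclass only needs to declare the data that actually differs: tokenizer prefixes, suffixes, infixes and the tag map now fall back to the shared defaults in spacy/language_data through BaseDefaults, while tokenizer exceptions and stop words come from the per-language language_data module. The sketch below is a minimal illustration of that pattern, not part of the patch series; it assumes a hypothetical spacy/xx/ package whose language_data.py exports TOKENIZER_EXCEPTIONS and STOP_WORDS the same way as the fr, it, nl and pt modules added above.

# encoding: utf8
from __future__ import unicode_literals, print_function

from ..language import Language
from ..attrs import LANG

from .language_data import *


class SampleLanguage(Language):
    # 'xx' is a placeholder language code used for illustration only.
    lang = 'xx'

    class Defaults(Language.Defaults):
        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'xx'

        # Only language-specific data is declared here; punctuation rules and
        # the tag map are inherited from spacy.language_data via BaseDefaults.
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS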