Mirror of https://github.com/explosion/spaCy.git

Commit 80134eb12d: Merge branch 'master' of https://github.com/spacy-io/spaCy
@@ -1,7 +1,5 @@
The MIT License (MIT)

spaCy Natural Language Processing Tools

Copyright (C) 2015 Matthew Honnibal

Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -1 +1,4 @@
recursive-include include *.h
include buildbot.json
include LICENSE
include README.rst
@@ -1,17 +0,0 @@
Python 2.7 Windows build has been tested with the following toolchain:
- Python 2.7.10 :)
- Microsoft Visual C++ Compiler Package for Python 2.7 http://www.microsoft.com/en-us/download/details.aspx?id=44266
- C99 compliant stdint.h for MSVC https://msinttypes.googlecode.com/svn/trunk/stdint.h
  (a C99 compliant stdint.h header is not supplied with Microsoft Visual C++ compilers prior to MSVC 2010)

Build steps:
- pip install --upgrade setuptools
- pip install cython fabric fabtools
- pip install -r requirements.txt
- pip install -e .

If you are using a traditional Microsoft SDK (v7.0 for Python 2.x or v7.1 for Python 3.x), consider run_with_env.cmd from the appveyor folder (submodule) as a guideline for environment setup.
It can also be used as a shell configuration script for your build, install and run commands, i.e.: cmd /E:ON /V:ON /C run_with_env.cmd <your command>
@@ -1,6 +1,7 @@
[![Travis CI status](https://travis-ci.org/spacy-io/spaCy.svg?branch=master)](https://travis-ci.org/spacy-io/spaCy)

.. image:: https://travis-ci.org/spacy-io/spaCy.svg?branch=master
    :target: https://travis-ci.org/spacy-io/spaCy

==============================
spaCy: Industrial-strength NLP
==============================

@@ -27,7 +28,7 @@ Features
* No pre-processing required. spaCy takes raw text as input, warts and newlines and all.

Top Peformance
-------------
--------------

* Fastest in the world: <50ms per document. No faster system has ever been
  announced.

@@ -38,14 +39,11 @@ Top Peformance
Supports
--------

* CPython 2.7
* CPython 3.4
* CPython 3.5
* CPython 2.6, 2.7, 3.3, 3.4, 3.5 (only 64 bit)
* OSX
* Linux
* Windows (Cygwin, MinGW, Visual Studio)

Difficult to support:

* PyPy 2.7
* PyPy 3.4
* PyPy
@@ -1,29 +0,0 @@
from __future__ import unicode_literals
import plac
import io

def main(in_loc, out_loc):
    this_key = None
    this_freq = 0
    df = 0
    with io.open(out_loc, 'w', encoding='utf8') as out_file:
        for line in io.open(in_loc, encoding='utf8'):
            line = line.strip()
            if not line:
                continue
            freq, key = line.split('\t', 1)
            freq = int(freq)
            if this_key is not None and key != this_key:
                out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
                this_key = key
                this_freq = freq
                df = 1
            else:
                this_freq += freq
                df += 1
            this_key = key
        out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))


if __name__ == '__main__':
    plac.call(main)
|
@ -8,7 +8,6 @@ from os import path
|
|||
import os
|
||||
import bz2
|
||||
import ujson
|
||||
import codecs
|
||||
from preshed.counter import PreshCounter
|
||||
from joblib import Parallel, delayed
|
||||
import io
|
||||
|
@ -37,7 +36,7 @@ def count_freqs(input_loc, output_loc):
|
|||
doc = tokenizer(json_comment['body'])
|
||||
doc.count_by(ORTH, counts=counts)
|
||||
|
||||
with codecs.open(output_loc, 'w', 'utf8') as file_:
|
||||
with io.open(output_loc, 'w', 'utf8') as file_:
|
||||
for orth, freq in counts:
|
||||
string = tokenizer.vocab.strings[orth]
|
||||
if not string.isspace():
|
||||
|
|
|
@ -98,7 +98,7 @@ def _read_probs(loc):
|
|||
return probs, probs['-OOV-']
|
||||
|
||||
|
||||
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
|
||||
def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
|
||||
if not loc.exists():
|
||||
print("Warning: Frequencies file not found")
|
||||
return {}, 0.0
|
||||
|
@ -125,7 +125,8 @@ def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
|
|||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||
word = literal_eval(key)
|
||||
# word = literal_eval(key)
|
||||
word = key
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
log_smooth_count = math.log(smooth_count)
|
||||
probs[word] = math.log(smooth_count) - log_total
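The _read_freqs hunk above turns raw corpus counts into smoothed log-probabilities: each surviving count is run through counts.smoother() and the log of the total is subtracted from the log of the smoothed count. Below is a toy Python sketch of that arithmetic, with add-one smoothing standing in for the smoother, whose implementation is not part of this diff; the sample counts are made up.

import math

# Made-up counts standing in for the freqs.txt data.
freqs = {'the': 250000, 'spacy': 120, 'zymurgy': 3}

# Add-one smoothing stands in for counts.smoother(); the real smoother is not shown here.
total = sum(freqs.values()) + len(freqs)
log_total = math.log(total)

probs = {}
for word, freq in freqs.items():
    smooth_count = freq + 1
    probs[word] = math.log(smooth_count) - log_total

oov_prob = math.log(1) - log_total  # assumed mass for unseen words
print(probs, oov_prob)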
|
||||
|
@ -165,7 +166,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
|
|||
clusters = _read_clusters(src_dir / 'clusters.txt')
|
||||
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
|
||||
if not probs:
|
||||
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt.gz')
|
||||
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
|
||||
if not probs:
|
||||
oov_prob = -20
|
||||
else:
|
||||
|
@ -223,9 +224,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
|
|||
copyfile(str(lang_data_dir / 'gazetteer.json'),
|
||||
str(model_dir / 'vocab' / 'gazetteer.json'))
|
||||
|
||||
if (lang_data_dir / 'tag_map.json').exists():
|
||||
copyfile(str(lang_data_dir / 'tag_map.json'),
|
||||
str(model_dir / 'vocab' / 'tag_map.json'))
|
||||
copyfile(str(lang_data_dir / 'tag_map.json'),
|
||||
str(model_dir / 'vocab' / 'tag_map.json'))
|
||||
|
||||
if (lang_data_dir / 'lemma_rules.json').exists():
|
||||
copyfile(str(lang_data_dir / 'lemma_rules.json'),
|
||||
|
|
|
@ -14,6 +14,7 @@ import re
|
|||
|
||||
import spacy.util
|
||||
from spacy.en import English
|
||||
from spacy.de import German
|
||||
|
||||
from spacy.syntax.util import Config
|
||||
from spacy.gold import read_json_file
|
||||
|
@ -25,6 +26,7 @@ from spacy.syntax.arc_eager import ArcEager
|
|||
from spacy.syntax.ner import BiluoPushDown
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.syntax.parser import Parser
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
|
||||
|
||||
def _corrupt(c, noise_level):
|
||||
|
@ -82,7 +84,7 @@ def _merge_sents(sents):
|
|||
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
||||
seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
|
||||
beam_width=1, verbose=False,
|
||||
use_orig_arc_eager=False):
|
||||
use_orig_arc_eager=False, pseudoprojective=False):
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
ner_model_dir = path.join(model_dir, 'ner')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
|
@ -96,9 +98,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
os.mkdir(ner_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
if pseudoprojective:
|
||||
# preprocess training data here before ArcEager.get_labels() is called
|
||||
gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
|
||||
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=ArcEager.get_labels(gold_tuples),
|
||||
beam_width=beam_width)
|
||||
beam_width=beam_width,projectivize=pseudoprojective)
|
||||
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
||||
labels=BiluoPushDown.get_labels(gold_tuples),
|
||||
beam_width=0)
|
||||
|
@ -107,6 +113,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
gold_tuples = gold_tuples[:n_sents]
|
||||
|
||||
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
|
||||
if nlp.lang == 'de':
|
||||
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
|
||||
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
|
||||
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
|
||||
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
|
||||
|
@ -131,12 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
raw_text = add_noise(raw_text, corruption_level)
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
nlp.tagger(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples, make_projective=True)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
if not gold.is_projective:
|
||||
raise Exception(
|
||||
"Non-projective sentence in training, after we should "
|
||||
"have enforced projectivity: %s" % annot_tuples
|
||||
)
|
||||
raise Exception("Non-projective sentence in training: %s" % annot_tuples)
|
||||
loss += nlp.parser.train(tokens, gold)
|
||||
nlp.entity.train(tokens, gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
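The training loop above raises an exception when it meets a non-projective gold parse, and the new pseudoprojective flag preprocesses the training data so that cannot happen. As a standalone illustration of what projectivity means for a list of head indices (this is only a sketch, not spaCy's PseudoProjectivity code):

def is_projective(heads):
    """heads[i] is the index of token i's head; the root points to itself."""
    arcs = [(min(i, h), max(i, h)) for i, h in enumerate(heads) if i != h]
    for s1, e1 in arcs:
        for s2, e2 in arcs:
            # Two arcs cross if exactly one endpoint of the second lies strictly inside the first.
            if s1 < s2 < e1 < e2:
                return False
    return True


# "A hearing is scheduled on the issue today": 'on' attaches to 'hearing'
# across 'scheduled', so two arcs cross.
print(is_projective([1, 3, 3, 3, 1, 6, 4, 3]))   # False
print(is_projective([1, 1, 1]))                  # flat attachment to the root: True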
|
||||
|
@ -152,6 +157,8 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
beam_width=None, cand_preproc=None):
|
||||
nlp = Language(data_dir=model_dir)
|
||||
if nlp.lang == 'de':
|
||||
nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
|
||||
if beam_width is not None:
|
||||
nlp.parser.cfg.beam_width = beam_width
|
||||
scorer = Scorer()
|
||||
|
@ -200,6 +207,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
language=("The language to train", "positional", None, str, ['en','de']),
|
||||
train_loc=("Location of training file or directory"),
|
||||
dev_loc=("Location of development file or directory"),
|
||||
model_dir=("Location of output model directory",),
|
||||
|
@ -211,19 +219,22 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
|
|||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
verbose=("Verbose error reporting", "flag", "v", bool),
|
||||
debug=("Debug mode", "flag", "d", bool),
|
||||
pseudoprojective=("Use pseudo-projective parsing", "flag", "p", bool),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
|
||||
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
|
||||
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
|
||||
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
|
||||
lang = {'en':English, 'de':German}.get(language)
|
||||
|
||||
if not eval_only:
|
||||
gold_train = list(read_json_file(train_loc))
|
||||
train(English, gold_train, model_dir,
|
||||
train(lang, gold_train, model_dir,
|
||||
feat_set='basic' if not debug else 'debug',
|
||||
gold_preproc=gold_preproc, n_sents=n_sents,
|
||||
corruption_level=corruption_level, n_iter=n_iter,
|
||||
verbose=verbose)
|
||||
verbose=verbose,pseudoprojective=pseudoprojective)
|
||||
if out_loc:
|
||||
write_parses(English, dev_loc, model_dir, out_loc)
|
||||
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
||||
write_parses(lang, dev_loc, model_dir, out_loc)
|
||||
scorer = evaluate(lang, list(read_json_file(dev_loc)),
|
||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||
print('TOK', scorer.token_acc)
|
||||
print('POS', scorer.tags_acc)
|
||||
|
|
bin/tagger/train_german_tagger.py (new file, 160 lines)
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
from os import path
|
||||
import shutil
|
||||
import io
|
||||
import random
|
||||
import time
|
||||
import gzip
|
||||
import ujson
|
||||
|
||||
import plac
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy.util
|
||||
from spacy.de import German
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.scorer import PRFScore
|
||||
|
||||
from spacy.tagger import P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
|
||||
from spacy.tagger import P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
|
||||
from spacy.tagger import W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
|
||||
from spacy.tagger import N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
|
||||
from spacy.tagger import N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags, N_CONTEXT_FIELDS
|
||||
|
||||
|
||||
def default_templates():
|
||||
return spacy.tagger.Tagger.default_templates()
|
||||
|
||||
def default_templates_without_clusters():
|
||||
return (
|
||||
(W_orth,),
|
||||
(P1_lemma, P1_pos),
|
||||
(P2_lemma, P2_pos),
|
||||
(N1_orth,),
|
||||
(N2_orth,),
|
||||
|
||||
(W_suffix,),
|
||||
(W_prefix,),
|
||||
|
||||
(P1_pos,),
|
||||
(P2_pos,),
|
||||
(P1_pos, P2_pos),
|
||||
(P1_pos, W_orth),
|
||||
(P1_suffix,),
|
||||
(N1_suffix,),
|
||||
|
||||
(W_shape,),
|
||||
|
||||
(W_flags,),
|
||||
(N1_flags,),
|
||||
(N2_flags,),
|
||||
(P1_flags,),
|
||||
(P2_flags,),
|
||||
)
|
||||
|
||||
|
||||
def make_tagger(vocab, templates):
|
||||
model = spacy.tagger.TaggerModel(templates)
|
||||
return spacy.tagger.Tagger(vocab,model)
|
||||
|
||||
|
||||
def read_conll(file_):
|
||||
def sentences():
|
||||
words, tags = [], []
|
||||
for line in file_:
|
||||
line = line.strip()
|
||||
if line:
|
||||
word, tag = line.split('\t')[1::3][:2] # get column 1 and 4 (CoNLL09)
|
||||
words.append(word)
|
||||
tags.append(tag)
|
||||
elif words:
|
||||
yield words, tags
|
||||
words, tags = [], []
|
||||
if words:
|
||||
yield words, tags
|
||||
return [ s for s in sentences() ]
|
||||
|
||||
|
||||
def score_model(score, nlp, words, gold_tags):
|
||||
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||
assert(len(tokens) == len(gold_tags))
|
||||
nlp.tagger(tokens)
|
||||
|
||||
for token, gold_tag in zip(tokens,gold_tags):
|
||||
score.score_set(set([token.tag_]),set([gold_tag]))
|
||||
|
||||
|
||||
def train(Language, train_sents, dev_sents, model_dir, n_iter=15, seed=21):
|
||||
# make shuffling deterministic
|
||||
random.seed(seed)
|
||||
|
||||
# set up directory for model
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
|
||||
nlp.tagger = make_tagger(nlp.vocab,default_templates())
|
||||
|
||||
print("Itn.\ttrain acc %\tdev acc %")
|
||||
for itn in range(n_iter):
|
||||
# train on train set
|
||||
#train_acc = PRFScore()
|
||||
correct, total = 0., 0.
|
||||
for words, gold_tags in train_sents:
|
||||
tokens = nlp.tokenizer.tokens_from_list(words)
|
||||
correct += nlp.tagger.train(tokens, gold_tags)
|
||||
total += len(words)
|
||||
train_acc = correct/total
|
||||
|
||||
# test on dev set
|
||||
dev_acc = PRFScore()
|
||||
for words, gold_tags in dev_sents:
|
||||
score_model(dev_acc, nlp, words, gold_tags)
|
||||
|
||||
random.shuffle(train_sents)
|
||||
print('%d:\t%6.2f\t%6.2f' % (itn, 100*train_acc, 100*dev_acc.precision))
|
||||
|
||||
|
||||
print('end training')
|
||||
nlp.end_training(model_dir)
|
||||
print('done')
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
train_loc=("Location of CoNLL 09 formatted training file"),
|
||||
dev_loc=("Location of CoNLL 09 formatted development file"),
|
||||
model_dir=("Location of output model directory"),
|
||||
eval_only=("Skip training, and only evaluate", "flag", "e", bool),
|
||||
n_iter=("Number of training iterations", "option", "i", int),
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, eval_only=False, n_iter=15):
|
||||
# training
|
||||
if not eval_only:
|
||||
with io.open(train_loc, 'r', encoding='utf8') as trainfile_, \
|
||||
io.open(dev_loc, 'r', encoding='utf8') as devfile_:
|
||||
train_sents = read_conll(trainfile_)
|
||||
dev_sents = read_conll(devfile_)
|
||||
train(German, train_sents, dev_sents, model_dir, n_iter=n_iter)
|
||||
|
||||
# testing
|
||||
with io.open(dev_loc, 'r', encoding='utf8') as file_:
|
||||
dev_sents = read_conll(file_)
|
||||
nlp = German(data_dir=model_dir)
|
||||
|
||||
dev_acc = PRFScore()
|
||||
for words, gold_tags in dev_sents:
|
||||
score_model(dev_acc, nlp, words, gold_tags)
|
||||
|
||||
print('POS: %6.2f %%' % (100*dev_acc.precision))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
|
@ -1,18 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
HERE=`pwd`
|
||||
cd /tmp
|
||||
wget http://www.python.org/ftp/python/2.7.5/Python-2.7.5.tgz /tmp
|
||||
tar -zxvf Python-2.7.5.tgz
|
||||
cd Python-2.7.5
|
||||
mkdir $HERE/.python
|
||||
./configure --prefix=$HERE/.python
|
||||
make
|
||||
make install
|
||||
cd /tmp
|
||||
wget --no-check-certificate https://pypi.python.org/packages/source/v/virtualenv/virtualenv-1.11.6.tar.gz
|
||||
tar -zxvf virtualenv-1.11.6.tar.gz
|
||||
cd virtualenv-1.11.6/
|
||||
$HERE/.python/bin/python setup.py install
|
||||
cd $HERE
|
||||
$HERE/.python/bin/python /tmp/virtualenv-1.11.6/virtualenv.py .env -p $HERE/.python/bin/python2.7
|
|
@ -1,18 +1,23 @@
|
|||
{
|
||||
"build": {
|
||||
"sdist": [
|
||||
"python pip-clear.py",
|
||||
"pip install -r requirements.txt",
|
||||
"python setup.py sdist"
|
||||
],
|
||||
"install": [
|
||||
"python pip-clear.py",
|
||||
"pip install source.tar.gz",
|
||||
"python -m spacy.en.download --force"
|
||||
"pip install -v source.tar.gz"
|
||||
],
|
||||
"wheel": [
|
||||
"python untar.py source.tar.gz .",
|
||||
"python setup.py bdist_wheel",
|
||||
"python cpdist.py dist"
|
||||
]
|
||||
},
|
||||
"test": {
|
||||
"after": ["install"],
|
||||
"after": ["install", "wheel"],
|
||||
"run": [
|
||||
"python -m spacy.en.download --force"
|
||||
],
|
||||
"package": "spacy",
|
||||
"args": "--tb=native -x --models --vectors --slow"
|
||||
}
|
||||
|
|
|
@@ -4,7 +4,7 @@ Question:

In the documents and tutorials the main thing I haven't found is examples of how to break sentences down into small sub-thoughts/chunks. The noun_chunks is handy, but having examples of using token.head to find small (near-complete) sentence chunks would be neat.

Let's take the example sentence on https://api.spacy.io/displacy/index.html
Let's take the example sentence on https://displacy.spacy.io/displacy/index.html

displaCy uses CSS and JavaScript to show you how computers understand language

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
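Since the question explicitly asks for an example of using token.head to pull out sub-chunks, here is a minimal sketch that was not part of the original thread; the model name en_core_web_sm and the choice of clause labels are assumptions for illustration only.

import spacy

# Assumed model name; any English pipeline with a dependency parser would do.
nlp = spacy.load('en_core_web_sm')
doc = nlp("displaCy uses CSS and JavaScript to show you how computers understand language")

# Treat the subtree of each clausal dependent as one "sub-thought".
CLAUSE_DEPS = {'xcomp', 'ccomp', 'advcl'}  # assumed label set, purely illustrative

for token in doc:
    if token.dep_ in CLAUSE_DEPS:
        chunk = doc[token.left_edge.i : token.right_edge.i + 1]
        print(token.dep_, '->', chunk.text, '(head:', token.head.text, ')')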
|
||||
|
|
|
@ -58,7 +58,7 @@ def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
|
|||
n_thread=("Number of threads per process", "option", "t", int),
|
||||
batch_size=("Number of texts to accumulate in a buffer", "option", "b", int)
|
||||
)
|
||||
def main(in_loc, out_dir, n_process=1, n_thread=4):
|
||||
def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100):
|
||||
if not path.exists(out_dir):
|
||||
path.join(out_dir)
|
||||
if n_process >= 2:
|
||||
|
|
|
@ -54,7 +54,7 @@ def represent_word(word):
|
|||
# Only do this if the lower-cased form is more probable.
|
||||
if text.istitle() \
|
||||
and is_sent_begin(word) \
|
||||
and word.prob < word.vocab[text.lower()].prob:
|
||||
and word.prob < word.doc.vocab[text.lower()].prob:
|
||||
text = text.lower()
|
||||
return text + '|' + word.tag_
|
||||
|
||||
|
|
lang_data/de/abbrev.de.tab (new file, 319 lines)
|
@ -0,0 +1,319 @@
|
|||
# surface form lemma pos
|
||||
# multiple values are separated by |
|
||||
# empty lines and lines starting with # are being ignored
|
||||
|
||||
'' ''
|
||||
\") \")
|
||||
\n \n <nl> SP
|
||||
\t \t <tab> SP
|
||||
<space> SP
|
||||
|
||||
# example: Wie geht's?
|
||||
's 's es
|
||||
'S 'S es
|
||||
|
||||
# example: Haste mal 'nen Euro?
|
||||
'n 'n ein
|
||||
'ne 'ne eine
|
||||
'nen 'nen einen
|
||||
|
||||
# example: Kommen S’ nur herein!
|
||||
s' s' sie
|
||||
S' S' sie
|
||||
|
||||
# example: Da haben wir's!
|
||||
ich's ich|'s ich|es
|
||||
du's du|'s du|es
|
||||
er's er|'s er|es
|
||||
sie's sie|'s sie|es
|
||||
wir's wir|'s wir|es
|
||||
ihr's ihr|'s ihr|es
|
||||
|
||||
# example: Die katze auf'm dach.
|
||||
auf'm auf|'m auf|dem
|
||||
unter'm unter|'m unter|dem
|
||||
über'm über|'m über|dem
|
||||
vor'm vor|'m vor|dem
|
||||
hinter'm hinter|'m hinter|dem
|
||||
|
||||
# persons
|
||||
B.A. B.A.
|
||||
B.Sc. B.Sc.
|
||||
Dipl. Dipl.
|
||||
Dipl.-Ing. Dipl.-Ing.
|
||||
Dr. Dr.
|
||||
Fr. Fr.
|
||||
Frl. Frl.
|
||||
Hr. Hr.
|
||||
Hrn. Hrn.
|
||||
Frl. Frl.
|
||||
Prof. Prof.
|
||||
St. St.
|
||||
Hrgs. Hrgs.
|
||||
Hg. Hg.
|
||||
a.Z. a.Z.
|
||||
a.D. a.D.
|
||||
h.c. h.c.
|
||||
Jr. Jr.
|
||||
jr. jr.
|
||||
jun. jun.
|
||||
sen. sen.
|
||||
rer. rer.
|
||||
Ing. Ing.
|
||||
M.A. M.A.
|
||||
Mr. Mr.
|
||||
M.Sc. M.Sc.
|
||||
nat. nat.
|
||||
phil. phil.
|
||||
|
||||
# companies
|
||||
Co. Co.
|
||||
co. co.
|
||||
Cie. Cie.
|
||||
A.G. A.G.
|
||||
G.m.b.H. G.m.b.H.
|
||||
i.G. i.G.
|
||||
e.V. e.V.
|
||||
|
||||
# popular german abbreviations
|
||||
Abb. Abb.
|
||||
Abk. Abk.
|
||||
Abs. Abs.
|
||||
Abt. Abt.
|
||||
abzgl. abzgl.
|
||||
allg. allg.
|
||||
a.M. a.M.
|
||||
Bd. Bd.
|
||||
betr. betr.
|
||||
Betr. Betr.
|
||||
Biol. Biol.
|
||||
biol. biol.
|
||||
Bf. Bf.
|
||||
Bhf. Bhf.
|
||||
Bsp. Bsp.
|
||||
bspw. bspw.
|
||||
bzgl. bzgl.
|
||||
bzw. bzw.
|
||||
d.h. d.h.
|
||||
dgl. dgl.
|
||||
ebd. ebd.
|
||||
ehem. ehem.
|
||||
eigtl. eigtl.
|
||||
entspr. entspr.
|
||||
erm. erm.
|
||||
ev. ev.
|
||||
evtl. evtl.
|
||||
Fa. Fa.
|
||||
Fam. Fam.
|
||||
geb. geb.
|
||||
Gebr. Gebr.
|
||||
gem. gem.
|
||||
ggf. ggf.
|
||||
ggü. ggü.
|
||||
ggfs. ggfs.
|
||||
gegr. gegr.
|
||||
Hbf. Hbf.
|
||||
Hrsg. Hrsg.
|
||||
hrsg. hrsg.
|
||||
i.A. i.A.
|
||||
i.d.R. i.d.R.
|
||||
inkl. inkl.
|
||||
insb. insb.
|
||||
i.O. i.O.
|
||||
i.Tr. i.Tr.
|
||||
i.V. i.V.
|
||||
jur. jur.
|
||||
kath. kath.
|
||||
K.O. K.O.
|
||||
lt. lt.
|
||||
max. max.
|
||||
m.E. m.E.
|
||||
m.M. m.M.
|
||||
mtl. mtl.
|
||||
min. min.
|
||||
mind. mind.
|
||||
MwSt. MwSt.
|
||||
Nr. Nr.
|
||||
o.a. o.a.
|
||||
o.ä. o.ä.
|
||||
o.Ä. o.Ä.
|
||||
o.g. o.g.
|
||||
o.k. o.k.
|
||||
O.K. O.K.
|
||||
Orig. Orig.
|
||||
orig. orig.
|
||||
pers. pers.
|
||||
Pkt. Pkt.
|
||||
Red. Red.
|
||||
röm. röm.
|
||||
s.o. s.o.
|
||||
sog. sog.
|
||||
std. std.
|
||||
stellv. stellv.
|
||||
Str. Str.
|
||||
tägl. tägl.
|
||||
Tel. Tel.
|
||||
u.a. u.a.
|
||||
usf. usf.
|
||||
u.s.w. u.s.w.
|
||||
usw. usw.
|
||||
u.U. u.U.
|
||||
u.v.m. u.v.m.
|
||||
uvm. uvm.
|
||||
v.a. v.a.
|
||||
vgl. vgl.
|
||||
vllt. vllt.
|
||||
v.l.n.r. v.l.n.r.
|
||||
vlt. vlt.
|
||||
Vol. Vol.
|
||||
wiss. wiss.
|
||||
Univ. Univ.
|
||||
z.B. z.B.
|
||||
z.b. z.b.
|
||||
z.Bsp. z.Bsp.
|
||||
z.T. z.T.
|
||||
z.Z. z.Z.
|
||||
zzgl. zzgl.
|
||||
z.Zt. z.Zt.
|
||||
|
||||
# popular latin abbreviations
|
||||
vs. vs.
|
||||
adv. adv.
|
||||
Chr. Chr.
|
||||
A.C. A.C.
|
||||
A.D. A.D.
|
||||
e.g. e.g.
|
||||
i.e. i.e.
|
||||
al. al.
|
||||
p.a. p.a.
|
||||
P.S. P.S.
|
||||
q.e.d. q.e.d.
|
||||
R.I.P. R.I.P.
|
||||
etc. etc.
|
||||
incl. incl.
|
||||
ca. ca.
|
||||
n.Chr. n.Chr.
|
||||
p.s. p.s.
|
||||
v.Chr. v.Chr.
|
||||
|
||||
# popular english abbreviations
|
||||
D.C. D.C.
|
||||
N.Y. N.Y.
|
||||
N.Y.C. N.Y.C.
|
||||
U.S. U.S.
|
||||
U.S.A. U.S.A.
|
||||
L.A. L.A.
|
||||
U.S.S. U.S.S.
|
||||
|
||||
# dates & time
|
||||
Jan. Jan.
|
||||
Feb. Feb.
|
||||
Mrz. Mrz.
|
||||
Mär. Mär.
|
||||
Apr. Apr.
|
||||
Jun. Jun.
|
||||
Jul. Jul.
|
||||
Aug. Aug.
|
||||
Sep. Sep.
|
||||
Sept. Sept.
|
||||
Okt. Okt.
|
||||
Nov. Nov.
|
||||
Dez. Dez.
|
||||
Mo. Mo.
|
||||
Di. Di.
|
||||
Mi. Mi.
|
||||
Do. Do.
|
||||
Fr. Fr.
|
||||
Sa. Sa.
|
||||
So. So.
|
||||
Std. Std.
|
||||
Jh. Jh.
|
||||
Jhd. Jhd.
|
||||
|
||||
# numbers
|
||||
Tsd. Tsd.
|
||||
Mio. Mio.
|
||||
Mrd. Mrd.
|
||||
|
||||
# countries & languages
|
||||
engl. engl.
|
||||
frz. frz.
|
||||
lat. lat.
|
||||
österr. österr.
|
||||
|
||||
# smileys
|
||||
:) :)
|
||||
<3 <3
|
||||
;) ;)
|
||||
(: (:
|
||||
:( :(
|
||||
-_- -_-
|
||||
=) =)
|
||||
:/ :/
|
||||
:> :>
|
||||
;-) ;-)
|
||||
:Y :Y
|
||||
:P :P
|
||||
:-P :-P
|
||||
:3 :3
|
||||
=3 =3
|
||||
xD xD
|
||||
^_^ ^_^
|
||||
=] =]
|
||||
=D =D
|
||||
<333 <333
|
||||
:)) :))
|
||||
:0 :0
|
||||
-__- -__-
|
||||
xDD xDD
|
||||
o_o o_o
|
||||
o_O o_O
|
||||
V_V V_V
|
||||
=[[ =[[
|
||||
<33 <33
|
||||
;p ;p
|
||||
;D ;D
|
||||
;-p ;-p
|
||||
;( ;(
|
||||
:p :p
|
||||
:] :]
|
||||
:O :O
|
||||
:-/ :-/
|
||||
:-) :-)
|
||||
:((( :(((
|
||||
:(( :((
|
||||
:') :')
|
||||
(^_^) (^_^)
|
||||
(= (=
|
||||
o.O o.O
|
||||
|
||||
# single letters
|
||||
a. a.
|
||||
b. b.
|
||||
c. c.
|
||||
d. d.
|
||||
e. e.
|
||||
f. f.
|
||||
g. g.
|
||||
h. h.
|
||||
i. i.
|
||||
j. j.
|
||||
k. k.
|
||||
l. l.
|
||||
m. m.
|
||||
n. n.
|
||||
o. o.
|
||||
p. p.
|
||||
q. q.
|
||||
r. r.
|
||||
s. s.
|
||||
t. t.
|
||||
u. u.
|
||||
v. v.
|
||||
w. w.
|
||||
x. x.
|
||||
y. y.
|
||||
z. z.
|
||||
ä. ä.
|
||||
ö. ö.
|
||||
ü. ü.
|
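The comments at the top of abbrev.de.tab describe its layout: tab-separated surface form, lemma and pos columns, with | separating per-token values inside a column. Below is a small parsing sketch in the spirit of the read_hardcoded helper added later in this commit, ported to Python 3; the field names F, L, pos follow the tokenizer specials format used elsewhere in this diff.

import io
import itertools

FIELDNAMES = ['F', 'L', 'pos']  # surface form, lemma, part-of-speech tag


def read_abbrev_tab(loc):
    """Parse a tab file in the abbrev.de.tab layout into tokenizer specials."""
    specials = {}
    with io.open(loc, 'r', encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, _, rest = line.partition('\t')
            # Each remaining column can hold several |-separated values, one per sub-token.
            columns = [col.split('|') for col in rest.split('\t')]
            values = []
            for annotation in zip(*columns):
                values.append({k: v for k, v in itertools.zip_longest(FIELDNAMES, annotation) if v})
            specials[key] = values
    return specials


print(read_abbrev_tab('lang_data/de/abbrev.de.tab')["auf'm"])
# -> [{'F': 'auf', 'L': 'auf'}, {'F': "'m", 'L': 'dem'}]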
lang_data/de/gazetteer.json (new file, 194 lines)
|
@ -0,0 +1,194 @@
|
|||
{
|
||||
"Reddit": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "reddit"}]
|
||||
]
|
||||
],
|
||||
"SeptemberElevenAttacks": [
|
||||
"EVENT",
|
||||
{},
|
||||
[
|
||||
[
|
||||
{"orth": "9/11"}
|
||||
],
|
||||
[
|
||||
{"lower": "september"},
|
||||
{"orth": "11"}
|
||||
]
|
||||
]
|
||||
],
|
||||
"Linux": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "linux"}]
|
||||
]
|
||||
],
|
||||
"Haskell": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "haskell"}]
|
||||
]
|
||||
],
|
||||
"HaskellCurry": [
|
||||
"PERSON",
|
||||
{},
|
||||
[
|
||||
[
|
||||
{"lower": "haskell"},
|
||||
{"lower": "curry"}
|
||||
]
|
||||
]
|
||||
],
|
||||
"Javascript": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "javascript"}]
|
||||
]
|
||||
],
|
||||
"CSS": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "css"}],
|
||||
[{"lower": "css3"}]
|
||||
]
|
||||
],
|
||||
"displaCy": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "displacy"}]
|
||||
]
|
||||
],
|
||||
"spaCy": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"orth": "spaCy"}]
|
||||
]
|
||||
],
|
||||
|
||||
"HTML": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "html"}],
|
||||
[{"lower": "html5"}]
|
||||
]
|
||||
],
|
||||
"Python": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Python"}]
|
||||
]
|
||||
],
|
||||
"Ruby": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Ruby"}]
|
||||
]
|
||||
],
|
||||
"Digg": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "digg"}]
|
||||
]
|
||||
],
|
||||
"FoxNews": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Fox"}],
|
||||
[{"orth": "News"}]
|
||||
]
|
||||
],
|
||||
"Google": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"lower": "google"}]
|
||||
]
|
||||
],
|
||||
"Mac": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "mac"}]
|
||||
]
|
||||
],
|
||||
"Wikipedia": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "wikipedia"}]
|
||||
]
|
||||
],
|
||||
"Windows": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Windows"}]
|
||||
]
|
||||
],
|
||||
"Dell": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"lower": "dell"}]
|
||||
]
|
||||
],
|
||||
"Facebook": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"lower": "facebook"}]
|
||||
]
|
||||
],
|
||||
"Blizzard": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Blizzard"}]
|
||||
]
|
||||
],
|
||||
"Ubuntu": [
|
||||
"ORG",
|
||||
{},
|
||||
[
|
||||
[{"orth": "Ubuntu"}]
|
||||
]
|
||||
],
|
||||
"Youtube": [
|
||||
"PRODUCT",
|
||||
{},
|
||||
[
|
||||
[{"lower": "youtube"}]
|
||||
]
|
||||
],
|
||||
"false_positives": [
|
||||
null,
|
||||
{},
|
||||
[
|
||||
[{"orth": "Shit"}],
|
||||
[{"orth": "Weed"}],
|
||||
[{"orth": "Cool"}],
|
||||
[{"orth": "Btw"}],
|
||||
[{"orth": "Bah"}],
|
||||
[{"orth": "Bullshit"}],
|
||||
[{"orth": "Lol"}],
|
||||
[{"orth": "Yo"}, {"lower": "dawg"}],
|
||||
[{"orth": "Yay"}],
|
||||
[{"orth": "Ahh"}],
|
||||
[{"orth": "Yea"}],
|
||||
[{"orth": "Bah"}]
|
||||
]
|
||||
]
|
||||
}
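Each gazetteer entry above is a triple of entity label, attribute dict and a list of alternative token patterns. The following is a rough sketch of loading the file and feeding those token specs to spaCy's rule matcher; the Matcher calls follow the current API, not necessarily the loader that consumed this file in 2016, and the model name is an assumption.

import json

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')          # assumed model name
matcher = Matcher(nlp.vocab)

with open('lang_data/de/gazetteer.json', encoding='utf8') as f:
    gazetteer = json.load(f)

for name, (ent_label, attrs, token_specs) in gazetteer.items():
    # token_specs is a list of alternative patterns, each a list of per-token
    # attribute dicts, e.g. [[{"lower": "september"}, {"orth": "11"}]].
    patterns = [[{key.upper(): value for key, value in spec.items()} for spec in alt]
                for alt in token_specs]
    matcher.add(name, patterns)

doc = nlp("displaCy uses CSS and JavaScript")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)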
|
|
@ -1,5 +1,7 @@
|
|||
# coding=utf8
|
||||
import json
|
||||
import io
|
||||
import itertools
|
||||
|
||||
contractions = {}
|
||||
|
||||
|
@ -262,14 +264,30 @@ def get_token_properties(token, capitalize=False, remove_contractions=False):
|
|||
props["F"] = token
|
||||
return props
|
||||
|
||||
def create_entry(token, endings, capitalize=False, remove_contractions=False):
|
||||
|
||||
def create_entry(token, endings, capitalize=False, remove_contractions=False):
|
||||
properties = []
|
||||
properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))
|
||||
for e in endings:
|
||||
properties.append(get_token_properties(e, remove_contractions=remove_contractions))
|
||||
return properties
|
||||
|
||||
|
||||
FIELDNAMES = ['F','L','pos']
|
||||
def read_hardcoded(stream):
|
||||
hc_specials = {}
|
||||
for line in stream:
|
||||
line = line.strip()
|
||||
if line.startswith('#') or not line:
|
||||
continue
|
||||
key,_,rest = line.partition('\t')
|
||||
values = []
|
||||
for annotation in zip(*[ e.split('|') for e in rest.split('\t') ]):
|
||||
values.append({ k:v for k,v in itertools.izip_longest(FIELDNAMES,annotation) if v })
|
||||
hc_specials[key] = values
|
||||
return hc_specials
|
||||
|
||||
|
||||
def generate_specials():
|
||||
|
||||
specials = {}
|
||||
|
@ -303,7 +321,10 @@ def generate_specials():
|
|||
specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)
|
||||
|
||||
# add in hardcoded specials
|
||||
specials = dict(specials, **hardcoded_specials)
|
||||
# changed it so it generates them from a file
|
||||
with io.open('abbrev.de.tab','r',encoding='utf8') as abbrev_:
|
||||
hc_specials = read_hardcoded(abbrev_)
|
||||
specials = dict(specials, **hc_specials)
|
||||
|
||||
return specials
|
||||
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
\.\.\.
|
||||
(?<=[a-z])\.(?=[A-Z])
|
||||
(?<=[a-zA-Z])-(?=[a-zA-z])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])
|
||||
(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
{
|
||||
*
|
||||
<
|
||||
>
|
||||
$
|
||||
£
|
||||
„
|
||||
|
@ -20,3 +21,7 @@ a-
|
|||
‘
|
||||
....
|
||||
...
|
||||
‚
|
||||
»
|
||||
_
|
||||
§
|
||||
|
|
|
@ -1,27 +1,4 @@
|
|||
{
|
||||
"\t": [
|
||||
{
|
||||
"F": "\t",
|
||||
"pos": "SP"
|
||||
}
|
||||
],
|
||||
"\n": [
|
||||
{
|
||||
"F": "\n",
|
||||
"pos": "SP"
|
||||
}
|
||||
],
|
||||
" ": [
|
||||
{
|
||||
"F": " ",
|
||||
"pos": "SP"
|
||||
}
|
||||
],
|
||||
"\")": [
|
||||
{
|
||||
"F": "\")"
|
||||
}
|
||||
],
|
||||
"''": [
|
||||
{
|
||||
"F": "''"
|
||||
|
@ -217,6 +194,11 @@
|
|||
"F": "<333"
|
||||
}
|
||||
],
|
||||
"<space>": [
|
||||
{
|
||||
"F": "SP"
|
||||
}
|
||||
],
|
||||
"=)": [
|
||||
{
|
||||
"F": "=)"
|
||||
|
@ -267,6 +249,16 @@
|
|||
"F": "Abk."
|
||||
}
|
||||
],
|
||||
"Abs.": [
|
||||
{
|
||||
"F": "Abs."
|
||||
}
|
||||
],
|
||||
"Abt.": [
|
||||
{
|
||||
"F": "Abt."
|
||||
}
|
||||
],
|
||||
"Apr.": [
|
||||
{
|
||||
"F": "Apr."
|
||||
|
@ -277,6 +269,26 @@
|
|||
"F": "Aug."
|
||||
}
|
||||
],
|
||||
"B.A.": [
|
||||
{
|
||||
"F": "B.A."
|
||||
}
|
||||
],
|
||||
"B.Sc.": [
|
||||
{
|
||||
"F": "B.Sc."
|
||||
}
|
||||
],
|
||||
"Bd.": [
|
||||
{
|
||||
"F": "Bd."
|
||||
}
|
||||
],
|
||||
"Betr.": [
|
||||
{
|
||||
"F": "Betr."
|
||||
}
|
||||
],
|
||||
"Bf.": [
|
||||
{
|
||||
"F": "Bf."
|
||||
|
@ -292,6 +304,11 @@
|
|||
"F": "Biol."
|
||||
}
|
||||
],
|
||||
"Bsp.": [
|
||||
{
|
||||
"F": "Bsp."
|
||||
}
|
||||
],
|
||||
"Chr.": [
|
||||
{
|
||||
"F": "Chr."
|
||||
|
@ -342,6 +359,16 @@
|
|||
"F": "Dr."
|
||||
}
|
||||
],
|
||||
"Fa.": [
|
||||
{
|
||||
"F": "Fa."
|
||||
}
|
||||
],
|
||||
"Fam.": [
|
||||
{
|
||||
"F": "Fam."
|
||||
}
|
||||
],
|
||||
"Feb.": [
|
||||
{
|
||||
"F": "Feb."
|
||||
|
@ -387,6 +414,16 @@
|
|||
"F": "Hrgs."
|
||||
}
|
||||
],
|
||||
"Hrn.": [
|
||||
{
|
||||
"F": "Hrn."
|
||||
}
|
||||
],
|
||||
"Hrsg.": [
|
||||
{
|
||||
"F": "Hrsg."
|
||||
}
|
||||
],
|
||||
"Ing.": [
|
||||
{
|
||||
"F": "Ing."
|
||||
|
@ -397,11 +434,21 @@
|
|||
"F": "Jan."
|
||||
}
|
||||
],
|
||||
"Jh.": [
|
||||
{
|
||||
"F": "Jh."
|
||||
}
|
||||
],
|
||||
"Jhd.": [
|
||||
{
|
||||
"F": "Jhd."
|
||||
}
|
||||
],
|
||||
"Jr.": [
|
||||
{
|
||||
"F": "Jr."
|
||||
}
|
||||
],
|
||||
"Jul.": [
|
||||
{
|
||||
"F": "Jul."
|
||||
|
@ -412,21 +459,61 @@
|
|||
"F": "Jun."
|
||||
}
|
||||
],
|
||||
"K.O.": [
|
||||
{
|
||||
"F": "K.O."
|
||||
}
|
||||
],
|
||||
"L.A.": [
|
||||
{
|
||||
"F": "L.A."
|
||||
}
|
||||
],
|
||||
"M.A.": [
|
||||
{
|
||||
"F": "M.A."
|
||||
}
|
||||
],
|
||||
"M.Sc.": [
|
||||
{
|
||||
"F": "M.Sc."
|
||||
}
|
||||
],
|
||||
"Mi.": [
|
||||
{
|
||||
"F": "Mi."
|
||||
}
|
||||
],
|
||||
"Mio.": [
|
||||
{
|
||||
"F": "Mio."
|
||||
}
|
||||
],
|
||||
"Mo.": [
|
||||
{
|
||||
"F": "Mo."
|
||||
}
|
||||
],
|
||||
"Mr.": [
|
||||
{
|
||||
"F": "Mr."
|
||||
}
|
||||
],
|
||||
"Mrd.": [
|
||||
{
|
||||
"F": "Mrd."
|
||||
}
|
||||
],
|
||||
"Mrz.": [
|
||||
{
|
||||
"F": "Mrz."
|
||||
}
|
||||
],
|
||||
"MwSt.": [
|
||||
{
|
||||
"F": "MwSt."
|
||||
}
|
||||
],
|
||||
"M\u00e4r.": [
|
||||
{
|
||||
"F": "M\u00e4r."
|
||||
|
@ -452,16 +539,31 @@
|
|||
"F": "Nr."
|
||||
}
|
||||
],
|
||||
"O.K.": [
|
||||
{
|
||||
"F": "O.K."
|
||||
}
|
||||
],
|
||||
"Okt.": [
|
||||
{
|
||||
"F": "Okt."
|
||||
}
|
||||
],
|
||||
"Orig.": [
|
||||
{
|
||||
"F": "Orig."
|
||||
}
|
||||
],
|
||||
"P.S.": [
|
||||
{
|
||||
"F": "P.S."
|
||||
}
|
||||
],
|
||||
"Pkt.": [
|
||||
{
|
||||
"F": "Pkt."
|
||||
}
|
||||
],
|
||||
"Prof.": [
|
||||
{
|
||||
"F": "Prof."
|
||||
|
@ -472,6 +574,11 @@
|
|||
"F": "R.I.P."
|
||||
}
|
||||
],
|
||||
"Red.": [
|
||||
{
|
||||
"F": "Red."
|
||||
}
|
||||
],
|
||||
"S'": [
|
||||
{
|
||||
"F": "S'",
|
||||
|
@ -503,6 +610,41 @@
|
|||
"F": "St."
|
||||
}
|
||||
],
|
||||
"Std.": [
|
||||
{
|
||||
"F": "Std."
|
||||
}
|
||||
],
|
||||
"Str.": [
|
||||
{
|
||||
"F": "Str."
|
||||
}
|
||||
],
|
||||
"Tel.": [
|
||||
{
|
||||
"F": "Tel."
|
||||
}
|
||||
],
|
||||
"Tsd.": [
|
||||
{
|
||||
"F": "Tsd."
|
||||
}
|
||||
],
|
||||
"U.S.": [
|
||||
{
|
||||
"F": "U.S."
|
||||
}
|
||||
],
|
||||
"U.S.A.": [
|
||||
{
|
||||
"F": "U.S.A."
|
||||
}
|
||||
],
|
||||
"U.S.S.": [
|
||||
{
|
||||
"F": "U.S.S."
|
||||
}
|
||||
],
|
||||
"Univ.": [
|
||||
{
|
||||
"F": "Univ."
|
||||
|
@ -513,6 +655,30 @@
|
|||
"F": "V_V"
|
||||
}
|
||||
],
|
||||
"Vol.": [
|
||||
{
|
||||
"F": "Vol."
|
||||
}
|
||||
],
|
||||
"\\\")": [
|
||||
{
|
||||
"F": "\\\")"
|
||||
}
|
||||
],
|
||||
"\\n": [
|
||||
{
|
||||
"F": "\\n",
|
||||
"L": "<nl>",
|
||||
"pos": "SP"
|
||||
}
|
||||
],
|
||||
"\\t": [
|
||||
{
|
||||
"F": "\\t",
|
||||
"L": "<tab>",
|
||||
"pos": "SP"
|
||||
}
|
||||
],
|
||||
"^_^": [
|
||||
{
|
||||
"F": "^_^"
|
||||
|
@ -528,6 +694,11 @@
|
|||
"F": "a.D."
|
||||
}
|
||||
],
|
||||
"a.M.": [
|
||||
{
|
||||
"F": "a.M."
|
||||
}
|
||||
],
|
||||
"a.Z.": [
|
||||
{
|
||||
"F": "a.Z."
|
||||
|
@ -548,9 +719,15 @@
|
|||
"F": "al."
|
||||
}
|
||||
],
|
||||
"allg.": [
|
||||
{
|
||||
"F": "allg."
|
||||
}
|
||||
],
|
||||
"auf'm": [
|
||||
{
|
||||
"F": "auf"
|
||||
"F": "auf",
|
||||
"L": "auf"
|
||||
},
|
||||
{
|
||||
"F": "'m",
|
||||
|
@ -572,11 +749,31 @@
|
|||
"F": "biol."
|
||||
}
|
||||
],
|
||||
"bspw.": [
|
||||
{
|
||||
"F": "bspw."
|
||||
}
|
||||
],
|
||||
"bzgl.": [
|
||||
{
|
||||
"F": "bzgl."
|
||||
}
|
||||
],
|
||||
"bzw.": [
|
||||
{
|
||||
"F": "bzw."
|
||||
}
|
||||
],
|
||||
"c.": [
|
||||
{
|
||||
"F": "c."
|
||||
}
|
||||
],
|
||||
"ca.": [
|
||||
{
|
||||
"F": "ca."
|
||||
}
|
||||
],
|
||||
"co.": [
|
||||
{
|
||||
"F": "co."
|
||||
|
@ -587,9 +784,20 @@
|
|||
"F": "d."
|
||||
}
|
||||
],
|
||||
"d.h.": [
|
||||
{
|
||||
"F": "d.h."
|
||||
}
|
||||
],
|
||||
"dgl.": [
|
||||
{
|
||||
"F": "dgl."
|
||||
}
|
||||
],
|
||||
"du's": [
|
||||
{
|
||||
"F": "du"
|
||||
"F": "du",
|
||||
"L": "du"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
|
@ -611,19 +819,35 @@
|
|||
"F": "e.g."
|
||||
}
|
||||
],
|
||||
"ebd.": [
|
||||
{
|
||||
"F": "ebd."
|
||||
}
|
||||
],
|
||||
"ehem.": [
|
||||
{
|
||||
"F": "ehem."
|
||||
}
|
||||
],
|
||||
"eigtl.": [
|
||||
{
|
||||
"F": "eigtl."
|
||||
}
|
||||
],
|
||||
"engl.": [
|
||||
{
|
||||
"F": "engl."
|
||||
}
|
||||
],
|
||||
"entspr.": [
|
||||
{
|
||||
"F": "entspr."
|
||||
}
|
||||
],
|
||||
"er's": [
|
||||
{
|
||||
"F": "er"
|
||||
"F": "er",
|
||||
"L": "er"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
|
@ -640,11 +864,26 @@
|
|||
"F": "etc."
|
||||
}
|
||||
],
|
||||
"ev.": [
|
||||
{
|
||||
"F": "ev."
|
||||
}
|
||||
],
|
||||
"evtl.": [
|
||||
{
|
||||
"F": "evtl."
|
||||
}
|
||||
],
|
||||
"f.": [
|
||||
{
|
||||
"F": "f."
|
||||
}
|
||||
],
|
||||
"frz.": [
|
||||
{
|
||||
"F": "frz."
|
||||
}
|
||||
],
|
||||
"g.": [
|
||||
{
|
||||
"F": "g."
|
||||
|
@ -660,6 +899,11 @@
|
|||
"F": "gegr."
|
||||
}
|
||||
],
|
||||
"gem.": [
|
||||
{
|
||||
"F": "gem."
|
||||
}
|
||||
],
|
||||
"ggf.": [
|
||||
{
|
||||
"F": "ggf."
|
||||
|
@ -687,23 +931,39 @@
|
|||
],
|
||||
"hinter'm": [
|
||||
{
|
||||
"F": "hinter"
|
||||
"F": "hinter",
|
||||
"L": "hinter"
|
||||
},
|
||||
{
|
||||
"F": "'m",
|
||||
"L": "dem"
|
||||
}
|
||||
],
|
||||
"hrsg.": [
|
||||
{
|
||||
"F": "hrsg."
|
||||
}
|
||||
],
|
||||
"i.": [
|
||||
{
|
||||
"F": "i."
|
||||
}
|
||||
],
|
||||
"i.A.": [
|
||||
{
|
||||
"F": "i.A."
|
||||
}
|
||||
],
|
||||
"i.G.": [
|
||||
{
|
||||
"F": "i.G."
|
||||
}
|
||||
],
|
||||
"i.O.": [
|
||||
{
|
||||
"F": "i.O."
|
||||
}
|
||||
],
|
||||
"i.Tr.": [
|
||||
{
|
||||
"F": "i.Tr."
|
||||
|
@ -714,6 +974,11 @@
|
|||
"F": "i.V."
|
||||
}
|
||||
],
|
||||
"i.d.R.": [
|
||||
{
|
||||
"F": "i.d.R."
|
||||
}
|
||||
],
|
||||
"i.e.": [
|
||||
{
|
||||
"F": "i.e."
|
||||
|
@ -721,7 +986,8 @@
|
|||
],
|
||||
"ich's": [
|
||||
{
|
||||
"F": "ich"
|
||||
"F": "ich",
|
||||
"L": "ich"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
|
@ -730,7 +996,8 @@
|
|||
],
|
||||
"ihr's": [
|
||||
{
|
||||
"F": "ihr"
|
||||
"F": "ihr",
|
||||
"L": "ihr"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
|
@ -757,6 +1024,11 @@
|
|||
"F": "j."
|
||||
}
|
||||
],
|
||||
"jr.": [
|
||||
{
|
||||
"F": "jr."
|
||||
}
|
||||
],
|
||||
"jun.": [
|
||||
{
|
||||
"F": "jun."
|
||||
|
@ -772,11 +1044,21 @@
|
|||
"F": "k."
|
||||
}
|
||||
],
|
||||
"kath.": [
|
||||
{
|
||||
"F": "kath."
|
||||
}
|
||||
],
|
||||
"l.": [
|
||||
{
|
||||
"F": "l."
|
||||
}
|
||||
],
|
||||
"lat.": [
|
||||
{
|
||||
"F": "lat."
|
||||
}
|
||||
],
|
||||
"lt.": [
|
||||
{
|
||||
"F": "lt."
|
||||
|
@ -787,11 +1069,46 @@
|
|||
"F": "m."
|
||||
}
|
||||
],
|
||||
"m.E.": [
|
||||
{
|
||||
"F": "m.E."
|
||||
}
|
||||
],
|
||||
"m.M.": [
|
||||
{
|
||||
"F": "m.M."
|
||||
}
|
||||
],
|
||||
"max.": [
|
||||
{
|
||||
"F": "max."
|
||||
}
|
||||
],
|
||||
"min.": [
|
||||
{
|
||||
"F": "min."
|
||||
}
|
||||
],
|
||||
"mind.": [
|
||||
{
|
||||
"F": "mind."
|
||||
}
|
||||
],
|
||||
"mtl.": [
|
||||
{
|
||||
"F": "mtl."
|
||||
}
|
||||
],
|
||||
"n.": [
|
||||
{
|
||||
"F": "n."
|
||||
}
|
||||
],
|
||||
"n.Chr.": [
|
||||
{
|
||||
"F": "n.Chr."
|
||||
}
|
||||
],
|
||||
"nat.": [
|
||||
{
|
||||
"F": "nat."
|
||||
|
@ -807,6 +1124,31 @@
|
|||
"F": "o.O"
|
||||
}
|
||||
],
|
||||
"o.a.": [
|
||||
{
|
||||
"F": "o.a."
|
||||
}
|
||||
],
|
||||
"o.g.": [
|
||||
{
|
||||
"F": "o.g."
|
||||
}
|
||||
],
|
||||
"o.k.": [
|
||||
{
|
||||
"F": "o.k."
|
||||
}
|
||||
],
|
||||
"o.\u00c4.": [
|
||||
{
|
||||
"F": "o.\u00c4."
|
||||
}
|
||||
],
|
||||
"o.\u00e4.": [
|
||||
{
|
||||
"F": "o.\u00e4."
|
||||
}
|
||||
],
|
||||
"o_O": [
|
||||
{
|
||||
"F": "o_O"
|
||||
|
@ -817,6 +1159,11 @@
|
|||
"F": "o_o"
|
||||
}
|
||||
],
|
||||
"orig.": [
|
||||
{
|
||||
"F": "orig."
|
||||
}
|
||||
],
|
||||
"p.": [
|
||||
{
|
||||
"F": "p."
|
||||
|
@ -827,6 +1174,21 @@
|
|||
"F": "p.a."
|
||||
}
|
||||
],
|
||||
"p.s.": [
|
||||
{
|
||||
"F": "p.s."
|
||||
}
|
||||
],
|
||||
"pers.": [
|
||||
{
|
||||
"F": "pers."
|
||||
}
|
||||
],
|
||||
"phil.": [
|
||||
{
|
||||
"F": "phil."
|
||||
}
|
||||
],
|
||||
"q.": [
|
||||
{
|
||||
"F": "q."
|
||||
|
@ -847,6 +1209,11 @@
|
|||
"F": "rer."
|
||||
}
|
||||
],
|
||||
"r\u00f6m.": [
|
||||
{
|
||||
"F": "r\u00f6m."
|
||||
}
|
||||
],
|
||||
"s'": [
|
||||
{
|
||||
"F": "s'",
|
||||
|
@ -858,6 +1225,11 @@
|
|||
"F": "s."
|
||||
}
|
||||
],
|
||||
"s.o.": [
|
||||
{
|
||||
"F": "s.o."
|
||||
}
|
||||
],
|
||||
"sen.": [
|
||||
{
|
||||
"F": "sen."
|
||||
|
@ -865,23 +1237,49 @@
|
|||
],
|
||||
"sie's": [
|
||||
{
|
||||
"F": "sie"
|
||||
"F": "sie",
|
||||
"L": "sie"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
"L": "es"
|
||||
}
|
||||
],
|
||||
"sog.": [
|
||||
{
|
||||
"F": "sog."
|
||||
}
|
||||
],
|
||||
"std.": [
|
||||
{
|
||||
"F": "std."
|
||||
}
|
||||
],
|
||||
"stellv.": [
|
||||
{
|
||||
"F": "stellv."
|
||||
}
|
||||
],
|
||||
"t.": [
|
||||
{
|
||||
"F": "t."
|
||||
}
|
||||
],
|
||||
"t\u00e4gl.": [
|
||||
{
|
||||
"F": "t\u00e4gl."
|
||||
}
|
||||
],
|
||||
"u.": [
|
||||
{
|
||||
"F": "u."
|
||||
}
|
||||
],
|
||||
"u.U.": [
|
||||
{
|
||||
"F": "u.U."
|
||||
}
|
||||
],
|
||||
"u.a.": [
|
||||
{
|
||||
"F": "u.a."
|
||||
|
@ -892,28 +1290,75 @@
|
|||
"F": "u.s.w."
|
||||
}
|
||||
],
|
||||
"u.v.m.": [
|
||||
{
|
||||
"F": "u.v.m."
|
||||
}
|
||||
],
|
||||
"unter'm": [
|
||||
{
|
||||
"F": "unter"
|
||||
"F": "unter",
|
||||
"L": "unter"
|
||||
},
|
||||
{
|
||||
"F": "'m",
|
||||
"L": "dem"
|
||||
}
|
||||
],
|
||||
"usf.": [
|
||||
{
|
||||
"F": "usf."
|
||||
}
|
||||
],
|
||||
"usw.": [
|
||||
{
|
||||
"F": "usw."
|
||||
}
|
||||
],
|
||||
"uvm.": [
|
||||
{
|
||||
"F": "uvm."
|
||||
}
|
||||
],
|
||||
"v.": [
|
||||
{
|
||||
"F": "v."
|
||||
}
|
||||
],
|
||||
"v.Chr.": [
|
||||
{
|
||||
"F": "v.Chr."
|
||||
}
|
||||
],
|
||||
"v.a.": [
|
||||
{
|
||||
"F": "v.a."
|
||||
}
|
||||
],
|
||||
"v.l.n.r.": [
|
||||
{
|
||||
"F": "v.l.n.r."
|
||||
}
|
||||
],
|
||||
"vgl.": [
|
||||
{
|
||||
"F": "vgl."
|
||||
}
|
||||
],
|
||||
"vllt.": [
|
||||
{
|
||||
"F": "vllt."
|
||||
}
|
||||
],
|
||||
"vlt.": [
|
||||
{
|
||||
"F": "vlt."
|
||||
}
|
||||
],
|
||||
"vor'm": [
|
||||
{
|
||||
"F": "vor"
|
||||
"F": "vor",
|
||||
"L": "vor"
|
||||
},
|
||||
{
|
||||
"F": "'m",
|
||||
|
@ -932,13 +1377,19 @@
|
|||
],
|
||||
"wir's": [
|
||||
{
|
||||
"F": "wir"
|
||||
"F": "wir",
|
||||
"L": "wir"
|
||||
},
|
||||
{
|
||||
"F": "'s",
|
||||
"L": "es"
|
||||
}
|
||||
],
|
||||
"wiss.": [
|
||||
{
|
||||
"F": "wiss."
|
||||
}
|
||||
],
|
||||
"x.": [
|
||||
{
|
||||
"F": "x."
|
||||
|
@ -969,19 +1420,60 @@
|
|||
"F": "z.B."
|
||||
}
|
||||
],
|
||||
"z.Bsp.": [
|
||||
{
|
||||
"F": "z.Bsp."
|
||||
}
|
||||
],
|
||||
"z.T.": [
|
||||
{
|
||||
"F": "z.T."
|
||||
}
|
||||
],
|
||||
"z.Z.": [
|
||||
{
|
||||
"F": "z.Z."
|
||||
}
|
||||
],
|
||||
"z.Zt.": [
|
||||
{
|
||||
"F": "z.Zt."
|
||||
}
|
||||
],
|
||||
"z.b.": [
|
||||
{
|
||||
"F": "z.b."
|
||||
}
|
||||
],
|
||||
"zzgl.": [
|
||||
{
|
||||
"F": "zzgl."
|
||||
}
|
||||
],
|
||||
"\u00e4.": [
|
||||
{
|
||||
"F": "\u00e4."
|
||||
}
|
||||
],
|
||||
"\u00f6.": [
|
||||
{
|
||||
"F": "\u00f6."
|
||||
}
|
||||
],
|
||||
"\u00f6sterr.": [
|
||||
{
|
||||
"F": "\u00f6sterr."
|
||||
}
|
||||
],
|
||||
"\u00fc.": [
|
||||
{
|
||||
"F": "\u00fc."
|
||||
}
|
||||
],
|
||||
"\u00fcber'm": [
|
||||
{
|
||||
"F": "\u00fcber"
|
||||
"F": "\u00fcber",
|
||||
"L": "\u00fcber"
|
||||
},
|
||||
{
|
||||
"F": "'m",
|
||||
|
|
|
@ -13,14 +13,61 @@
|
|||
;
|
||||
'
|
||||
”
|
||||
“
|
||||
«
|
||||
_
|
||||
''
|
||||
's
|
||||
'S
|
||||
’s
|
||||
’S
|
||||
’
|
||||
‘
|
||||
°
|
||||
€
|
||||
\.\.
|
||||
\.\.\.
|
||||
\.\.\.\.
|
||||
(?<=[a-z0-9)\]"'%\)])\.
|
||||
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
|
||||
\-\-
|
||||
´
|
||||
(?<=[0-9])km²
|
||||
(?<=[0-9])m²
|
||||
(?<=[0-9])cm²
|
||||
(?<=[0-9])mm²
|
||||
(?<=[0-9])km³
|
||||
(?<=[0-9])m³
|
||||
(?<=[0-9])cm³
|
||||
(?<=[0-9])mm³
|
||||
(?<=[0-9])ha
|
||||
(?<=[0-9])km
|
||||
(?<=[0-9])m
|
||||
(?<=[0-9])cm
|
||||
(?<=[0-9])mm
|
||||
(?<=[0-9])µm
|
||||
(?<=[0-9])nm
|
||||
(?<=[0-9])yd
|
||||
(?<=[0-9])in
|
||||
(?<=[0-9])ft
|
||||
(?<=[0-9])kg
|
||||
(?<=[0-9])g
|
||||
(?<=[0-9])mg
|
||||
(?<=[0-9])µg
|
||||
(?<=[0-9])t
|
||||
(?<=[0-9])lb
|
||||
(?<=[0-9])oz
|
||||
(?<=[0-9])m/s
|
||||
(?<=[0-9])km/h
|
||||
(?<=[0-9])mph
|
||||
(?<=[0-9])°C
|
||||
(?<=[0-9])°K
|
||||
(?<=[0-9])°F
|
||||
(?<=[0-9])hPa
|
||||
(?<=[0-9])Pa
|
||||
(?<=[0-9])mbar
|
||||
(?<=[0-9])mb
|
||||
(?<=[0-9])T
|
||||
(?<=[0-9])G
|
||||
(?<=[0-9])M
|
||||
(?<=[0-9])K
|
||||
(?<=[0-9])kb
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"name": "en_default",
|
||||
"name": "en",
|
||||
"version": "1.0.0",
|
||||
"description": "english test model",
|
||||
"license": "public domain",
|
||||
|
|
|
@ -1,13 +1,12 @@
|
|||
cython
|
||||
cymem>=1.30,<1.32
|
||||
pathlib
|
||||
numpy
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=0.46.1,<0.47.0
|
||||
thinc>=5.0.0,<5.1.0
|
||||
murmurhash>=0.26,<0.27
|
||||
text-unidecode
|
||||
numpy
|
||||
plac
|
||||
six
|
||||
ujson
|
||||
cloudpickle
|
||||
sputnik>=0.8.0,<0.9.0
|
||||
sputnik>=0.9.2,<0.10.0
|
||||
|
|
|
@ -1,340 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import print_function
|
||||
import sys
|
||||
|
||||
import falcon
|
||||
import json
|
||||
from os import path
|
||||
from collections import defaultdict
|
||||
import pprint
|
||||
import numpy
|
||||
|
||||
import spacy.en
|
||||
from spacy.attrs import ORTH, SPACY, TAG, POS, ENT_IOB, ENT_TYPE
|
||||
from spacy.parts_of_speech import NAMES as UNIV_POS_NAMES
|
||||
|
||||
try:
|
||||
unicode
|
||||
except NameError:
|
||||
unicode = str
|
||||
|
||||
|
||||
NLU = spacy.en.English()
|
||||
|
||||
|
||||
def merge_entities(doc):
|
||||
ents = [(e[0].idx, e[len(e)-1].idx + len(e[len(e)-1]), e.label_, e.text)
|
||||
for e in doc.ents if len(e) >= 2]
|
||||
for start, end, label, lemma in ents:
|
||||
merged = doc.merge(start, end, label, text, label)
|
||||
assert merged != None
|
||||
|
||||
|
||||
def merge_nps(doc):
|
||||
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.root.tag_, np.text)
|
||||
for np in doc.noun_chunks if len(np) >= 2]
|
||||
|
||||
for start, end, ent_type, lemma in nps:
|
||||
doc.merge(start, end, u'NP', lemma, ent_type)
|
||||
|
||||
|
||||
def merge_punct(tokens):
|
||||
# Merge punctuation onto its head
|
||||
collect = False
|
||||
start = None
|
||||
merges = []
|
||||
|
||||
for word in tokens:
|
||||
if word.whitespace_:
|
||||
if collect:
|
||||
span = tokens[start:word.i+1]
|
||||
if len(span) >= 2:
|
||||
merges.append((
|
||||
span[0].idx,
|
||||
span[-1].idx + len(span[-1]),
|
||||
span.root.tag_,
|
||||
span.root.lemma_,
|
||||
span.root.ent_type_))
|
||||
collect = False
|
||||
start = None
|
||||
elif not collect:
|
||||
collect = True
|
||||
start = word.i
|
||||
if collect:
|
||||
span = tokens[start:len(tokens)]
|
||||
merges.append((span[0].idx, span[-1].idx + len(span[-1]),
|
||||
span.root.tag_, span.root.lemma_, span.root.ent_type_))
|
||||
for merge in merges:
|
||||
tokens.merge(*merge)
|
||||
|
||||
|
||||
def get_actions(parse_state, n_actions):
|
||||
actions = []
|
||||
queue = list(sorted(parse_state.queue))
|
||||
stack = list(sorted(parse_state.stack))
|
||||
stack = []
|
||||
actions.append({'label': 'shift', 'key': 'S', 'binding': 38,
|
||||
'is_valid': NLU.parser.moves.is_valid(parse_state, 'S')})
|
||||
actions.append({'label': 'left', 'key': 'L', 'binding': 37,
|
||||
'is_valid': NLU.parser.moves.is_valid(parse_state, 'L-det')})
|
||||
actions.append({'label': 'predict', 'key': '_', 'binding': 32,
|
||||
'is_valid': bool(parse_state.queue or parse_state.stack)})
|
||||
actions.append({'label': 'right', 'key': 'R', 'binding': 39,
|
||||
'is_valid': NLU.parser.moves.is_valid(parse_state, 'R-dobj')})
|
||||
actions.append({'label': 'undo', 'key': '-', 'binding': 8,
|
||||
'is_valid': n_actions != 0})
|
||||
actions.append({'label': 'reduce', 'key': 'D', 'binding': 40,
|
||||
'is_valid': NLU.parser.moves.is_valid(parse_state, 'D')})
|
||||
return actions
|
||||
|
||||
|
||||
class Model(object):
|
||||
def to_json(self):
|
||||
return {name: _as_json(value) for name, value in self.__dict__.items()
|
||||
if not name.startswith('_')}
|
||||
|
||||
def _as_json(value):
|
||||
if hasattr(value, 'to_json'):
|
||||
return value.to_json()
|
||||
elif isinstance(value, list):
|
||||
return [_as_json(v) for v in value]
|
||||
elif isinstance(value, set):
|
||||
return {key: True for key in value}
|
||||
else:
|
||||
return value
|
||||
|
||||
|
||||
def _parse_history(history):
|
||||
if history and history.endswith(','):
|
||||
history = history[:-1]
|
||||
history = history.strip().split(',') if history else tuple()
|
||||
new_hist = []
|
||||
history_length = len(history)
|
||||
for action in history:
|
||||
if action == '-':
|
||||
if new_hist:
|
||||
new_hist.pop()
|
||||
else:
|
||||
new_hist.append(action)
|
||||
return new_hist, history_length
|
||||
|
||||
|
||||
def apply_edits(tokens, word_edits, tag_edits):
|
||||
new_words = []
|
||||
attrs = (POS, ENT_TYPE, ENT_IOB)
|
||||
new_analysis = numpy.zeros(shape=(len(tokens), len(attrs)), dtype=numpy.int32)
|
||||
for word in tokens:
|
||||
key = str(word.i)
|
||||
new_words.append(word_edits.get(key, word.orth_))
|
||||
tag = tag_edits.get(key, word.pos_)
|
||||
if tag in UNIV_POS_NAMES:
|
||||
new_analysis[word.i, 0] = UNIV_POS_NAMES[tag]
|
||||
# Set ent_type=0 and IOB="O"
|
||||
new_analysis[word.i, 1] = 0
|
||||
new_analysis[word.i, 2] = 2
|
||||
else:
|
||||
new_analysis[word.i, 0] = word.pos
|
||||
new_analysis[word.i, 1] = NLU.vocab.strings[tag]
|
||||
new_analysis[word.i, 2] = 3
|
||||
|
||||
doc = NLU.tokenizer.tokens_from_list(new_words)
|
||||
doc.from_array(attrs, new_analysis)
|
||||
NLU.parser(doc)
|
||||
return doc
|
||||
|
||||
|
||||
class Parse(Model):
|
||||
def __init__(self, doc, states, actions, **kwargs):
|
||||
word_edits = kwargs.get('words', {})
|
||||
tag_edits = kwargs.get('tags', {})
|
||||
if word_edits or tag_edits:
|
||||
doc = apply_edits(doc, word_edits, tag_edits)
|
||||
notes = kwargs.get('notes', {})
|
||||
self.actions = actions
|
||||
self.words = [Word(w, w.i in word_edits, w.i in tag_edits) for w in doc]
|
||||
self.states = states
|
||||
self.notes = notes
|
||||
for word in doc:
|
||||
print(word.orth_, word.head.orth_)
|
||||
|
||||
@classmethod
|
||||
def from_text(cls, text, **kwargs):
|
||||
tokens = NLU(text)
|
||||
#merge_entities(tokens)
|
||||
merge_nps(tokens)
|
||||
#merge_punct(tokens)
|
||||
return cls(tokens, [State.from_doc(tokens)], [], **kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_history(cls, text, history, **kwargs):
|
||||
if not isinstance(text, unicode):
|
||||
text = text.decode('utf8')
|
||||
text = text.replace('-SLASH-', '/')
|
||||
history, history_length = _parse_history(history)
|
||||
|
||||
tokens = NLU.tokenizer(text)
|
||||
NLU.tagger(tokens)
|
||||
NLU.matcher(tokens)
|
||||
|
||||
with NLU.parser.step_through(tokens) as state:
|
||||
for action in history:
|
||||
state.transition(action)
|
||||
|
||||
NLU.entity(tokens)
|
||||
actions = get_actions(state.stcls, len(history))
|
||||
return Parse(tokens, [State(state.heads, state.deps, state.stack, state.queue)],
|
||||
actions, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def with_history(cls, text):
|
||||
tokens = NLU.tokenizer(text)
|
||||
NLU.tagger(tokens)
|
||||
NLU.matcher(tokens)
|
||||
|
||||
with NLU.parser.step_through(tokens) as state:
|
||||
states = []
|
||||
while not state.is_final:
|
||||
action = state.predict()
|
||||
state.transition(action)
|
||||
states.append(State(state.heads, state.deps, state.stack, state.queue))
|
||||
actions = [
|
||||
{'label': 'prev', 'key': 'P', 'binding': 37, 'is_valid': True},
|
||||
{'label': 'next', 'key': 'N', 'binding': 39, 'is_valid': True}
|
||||
]
|
||||
return Parse(state.doc, states, actions)
|
||||
|
||||
|
||||
class Word(Model):
|
||||
def __init__(self, token, is_w_edit=False, is_t_edit=False):
|
||||
self.word = token.orth_
|
||||
self.tag = token.pos_
|
||||
self.tag = token.pos_ if not token.ent_type_ else token.ent_type_
|
||||
self.is_entity = token.ent_iob in (1, 3)
|
||||
self.is_w_edit = is_w_edit
|
||||
self.is_t_edit = is_t_edit
|
||||
self.prob = token.prob
|
||||
|
||||
|
||||
class State(Model):
|
||||
def __init__(self, heads, deps, stack, queue):
|
||||
Model.__init__(self)
|
||||
|
||||
queue = [w for w in queue if w >= 0]
|
||||
self.focus = min(queue) if queue else -1
|
||||
self.is_final = bool(not stack and not queue)
|
||||
self.stack = set(stack)
|
||||
self.arrows = self._get_arrows(heads, deps)
|
||||
|
||||
@classmethod
|
||||
def from_doc(cls, doc):
|
||||
        return cls([w.head.i for w in doc], [w.dep_ for w in doc], [], [])

    def _get_arrows(self, heads, deps):
        arcs = defaultdict(dict)
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                arcs[head - i][i] = Arrow(i, head, dep)
            elif i > head:
                arcs[i - head][head] = Arrow(i, head, dep)
        output = []
        for level in range(1, len(heads)):
            level_arcs = []
            for i in range(len(heads) - level):
                level_arcs.append(arcs[level].get(i))
            output.append(level_arcs)
        while output and all(arc is None for arc in output[-1]):
            output.pop()
        return output


class Arrow(Model):
    def __init__(self, word, head, label):
        self.dir = 'left' if head > word else 'right'
        self.label = label


class Endpoint(object):
    def set_header(self, resp):
        resp.content_type = 'text/string'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200

    def set_body(self, resp, parse):
        resp.body = json.dumps(parse.to_json(), indent=4)

    def on_get(self, req, resp, text):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        self.set_body(resp, self.get_parse(text))
        self.set_header(resp)

    def on_post(self, req, resp):
        try:
            body_bytes = req.stream.read()
            json_data = json.loads(body_bytes.decode('utf8'))
            text = json_data['text']
            if not isinstance(text, unicode):
                text = text.decode('utf8')
            self.set_body(resp, self.get_parse(text))
            self.set_header(resp)
        except:
            pass


class ParseEP(Endpoint):
    def get_parse(self, text, **kwargs):
        return Parse.from_text(text, **kwargs)


class StepsEP(Endpoint):
    def get_parse(self, text):
        print('Step=', repr(text))
        return Parse.with_history(text)


class ManualEP(Endpoint):
    def get_parse(self, text, **kwargs):
        print('Manual=', repr(text))
        if '/' in text:
            text, actions = text.rsplit('/', 1)
        else:
            actions = ''
        return Parse.from_history(text, actions, **kwargs)

    def on_get(self, req, resp, text, actions=''):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        self.set_body(resp, self.get_parse(text + '/' + actions))
        self.set_header(resp)

    def on_post(self, req, resp):
        self.set_header(resp)
        body_bytes = req.stream.read()
        json_data = json.loads(body_bytes.decode('utf8'))
        print(json_data)
        params = json_data.get('params', {})
        self.set_body(resp, self.get_parse(json_data['text'], **params))


app = falcon.API()

remote_man = ManualEP()
remote_parse = ParseEP()
remote_steps = StepsEP()

app.add_route('/api/displacy/parse/', remote_parse)
app.add_route('/api/displacy/parse/{text}/', remote_parse)

app.add_route('/api/displacy/steps/', remote_steps)
app.add_route('/api/displacy/steps/{text}/', remote_steps)

app.add_route('/api/displacy/manual/', remote_man)
app.add_route('/api/displacy/manual/{text}/', remote_man)
app.add_route('/api/displacy/manual/{text}/{actions}', remote_man)


if __name__ == '__main__':
    text, actions = open(sys.argv[1]).read().strip().split('\n')
    parse = Parse.from_text(text)
    pprint.pprint(parse.to_json())
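For orientation, a minimal client sketch for the routes registered above (illustrative only, not part of the commit; the host and port are assumptions for a local run of the falcon app):

    import json
    from urllib.parse import quote
    from urllib.request import urlopen

    BASE = 'http://localhost:8000'  # assumed host/port for a locally served app
    text = 'Robots in popular culture are there to remind us'

    # GET /api/displacy/parse/{text}/ returns the JSON built by Parse.to_json()
    with urlopen('%s/api/displacy/parse/%s/' % (BASE, quote(text))) as resp:
        parse = json.loads(resp.read().decode('utf8'))
    print(parse)
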
setup.py (46 changed lines)
@@ -1,7 +1,6 @@
#!/usr/bin/env python
from __future__ import print_function
import os
import shutil
import subprocess
import sys
import contextlib

@@ -26,13 +25,15 @@ PACKAGES = [
    'spacy.tests.morphology',
    'spacy.tests.munge',
    'spacy.tests.parser',
    'spacy.tests.print',
    'spacy.tests.serialize',
    'spacy.tests.spans',
    'spacy.tests.tagger',
    'spacy.tests.tokenizer',
    'spacy.tests.tokens',
    'spacy.tests.vectors',
    'spacy.tests.vocab']
    'spacy.tests.vocab',
    'spacy.tests.website']


MOD_NAMES = [

@@ -47,6 +48,7 @@ MOD_NAMES = [
    'spacy.syntax._state',
    'spacy.tokenizer',
    'spacy.syntax.parser',
    'spacy.syntax.nonproj',
    'spacy.syntax.transition_system',
    'spacy.syntax.arc_eager',
    'spacy.syntax._parse_features',

@@ -143,10 +145,13 @@ def setup_package():
        return clean(root)

    with chdir(root):
        about = {}
        with open(os.path.join(root, "spacy", "about.py")) as f:
        with open(os.path.join(root, 'spacy', 'about.py')) as f:
            about = {}
            exec(f.read(), about)

        with open(os.path.join(root, 'README.rst')) as f:
            readme = f.read()

        include_dirs = [
            get_python_inc(plat_specific=True),
            os.path.join(root, 'include')]

@@ -162,20 +167,45 @@ def setup_package():
        generate_cython(root, 'spacy')

        setup(
            name=about['__name__'],
            name=about['__title__'],
            zip_safe=False,
            packages=PACKAGES,
            package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']},
            description=about['__summary__'],
            long_description=readme,
            author=about['__author__'],
            author_email=about['__email__'],
            version=about['__version__'],
            url=about['__uri__'],
            license=about['__license__'],
            ext_modules=ext_modules,
            install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.32.0', 'preshed>=0.46.1,<0.47',
                              'thinc>=5.0.0,<5.1.0', 'text_unidecode', 'plac', 'six',
                              'ujson', 'cloudpickle', 'sputnik>=0.8.0,<0.9.0'],
            install_requires=[
                'numpy',
                'murmurhash>=0.26,<0.27',
                'cymem>=1.30,<1.32.0',
                'preshed>=0.46.1,<0.47',
                'thinc>=5.0.0,<5.1.0',
                'plac',
                'six',
                'ujson',
                'cloudpickle',
                'sputnik>=0.9.2,<0.10.0'],
            classifiers=[
                'Development Status :: 5 - Production/Stable',
                'Environment :: Console',
                'Intended Audience :: Developers',
                'Intended Audience :: Science/Research',
                'License :: OSI Approved :: MIT License',
                'Operating System :: POSIX :: Linux',
                'Operating System :: MacOS :: MacOS X',
                'Operating System :: Microsoft :: Windows',
                'Programming Language :: Cython',
                'Programming Language :: Python :: 2.6',
                'Programming Language :: Python :: 2.7',
                'Programming Language :: Python :: 3.3',
                'Programming Language :: Python :: 3.4',
                'Programming Language :: Python :: 3.5',
                'Topic :: Scientific/Engineering'],
            cmdclass = {
                'build_ext': build_ext_subclass},
        )
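The metadata values above come from spacy/about.py, which setup.py reads by exec-ing the file into a dict so the version is defined in one place. A standalone sketch of that pattern, assuming it is run from a spaCy checkout:

    import os

    about = {}
    with open(os.path.join('spacy', 'about.py')) as f:
        exec(f.read(), about)
    print(about['__title__'], about['__version__'])  # e.g. spacy 0.100.6
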
@@ -2,6 +2,7 @@ from . import util
from .en import English


def load(name, via=None):
    package = util.get_package_by_name(name, via=via)
    return English(package=package)
def load(name, vectors=None, via=None):
    return English(
        package=util.get_package_by_name(name, via=via),
        vectors_package=util.get_package_by_name(vectors, via=via))
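A hedged usage sketch of the new load() signature shown above; 'en' is the default model name used elsewhere in this commit, while the vectors package name is a placeholder:

    import spacy

    nlp = spacy.load('en')                            # bundled vectors
    # nlp = spacy.load('en', vectors='en_vectors')    # hypothetical separate vectors package
    doc = nlp(u'This is a sentence.')
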
@@ -3,12 +3,11 @@
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__name__ = 'spacy'
__version__ = '0.100.5'
__title__ = 'spacy'
__version__ = '0.100.6'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
__release__ = True
__default_model__ = 'en_default'
__default_model__ = 'en>=1.0.0,<1.1.0'
@@ -6,4 +6,4 @@ from ..language import Language


class German(Language):
    pass
    lang = 'de'
@@ -29,20 +29,20 @@ def migrate(path):
)
def main(data_size='all', force=False):
    if force:
        sputnik.purge(about.__name__, about.__version__)
        sputnik.purge(about.__title__, about.__version__)

    try:
        sputnik.package(about.__name__, about.__version__, about.__default_model__)
        sputnik.package(about.__title__, about.__version__, about.__default_model__)
        print("Model already installed. Please run 'python -m "
              "spacy.en.download --force' to reinstall.", file=sys.stderr)
        sys.exit(1)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        pass

    package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
    package = sputnik.install(about.__title__, about.__version__, about.__default_model__)

    try:
        sputnik.package(about.__name__, about.__version__, about.__default_model__)
        sputnik.package(about.__title__, about.__version__, about.__default_model__)
    except (PackageNotFoundException, CompatiblePackageNotFoundException):
        print("Model failed to install. Please run 'python -m "
              "spacy.en.download --force'.", file=sys.stderr)
|
|
@ -14,6 +14,8 @@ try:
|
|||
except ImportError:
|
||||
import json
|
||||
|
||||
from .syntax import nonproj
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
entities = []
|
||||
|
@ -237,33 +239,13 @@ cdef class GoldParse:
|
|||
self.labels[i] = annot_tuples[4][gold_i]
|
||||
self.ner[i] = annot_tuples[5][gold_i]
|
||||
|
||||
# If we have any non-projective arcs, i.e. crossing brackets, consider
|
||||
# the heads for those words missing in the gold-standard.
|
||||
# This way, we can train from these sentences
|
||||
cdef int w1, w2, h1, h2
|
||||
if make_projective:
|
||||
heads = list(self.heads)
|
||||
for w1 in range(self.length):
|
||||
if heads[w1] is not None:
|
||||
h1 = heads[w1]
|
||||
for w2 in range(w1+1, self.length):
|
||||
if heads[w2] is not None:
|
||||
h2 = heads[w2]
|
||||
if _arcs_cross(w1, h1, w2, h2):
|
||||
self.heads[w1] = None
|
||||
self.labels[w1] = ''
|
||||
self.heads[w2] = None
|
||||
self.labels[w2] = ''
|
||||
cycle = nonproj.contains_cycle(self.heads)
|
||||
if cycle != None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
|
||||
# Check there are no cycles in the dependencies, i.e. we are a tree
|
||||
for w in range(self.length):
|
||||
seen = set([w])
|
||||
head = w
|
||||
while self.heads[head] != head and self.heads[head] != None:
|
||||
head = self.heads[head]
|
||||
if head in seen:
|
||||
raise Exception("Cycle found: %s" % seen)
|
||||
seen.add(head)
|
||||
if make_projective:
|
||||
proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels)
|
||||
self.heads = proj_heads
|
||||
|
||||
self.brackets = {}
|
||||
for (gold_start, gold_end, label_str) in brackets:
|
||||
|
@ -278,25 +260,18 @@ cdef class GoldParse:
|
|||
|
||||
@property
|
||||
def is_projective(self):
|
||||
heads = list(self.heads)
|
||||
for w1 in range(self.length):
|
||||
if heads[w1] is not None:
|
||||
h1 = heads[w1]
|
||||
for w2 in range(self.length):
|
||||
if heads[w2] is not None and _arcs_cross(w1, h1, w2, heads[w2]):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
cdef int _arcs_cross(int w1, int h1, int w2, int h2) except -1:
|
||||
if w1 > h1:
|
||||
w1, h1 = h1, w1
|
||||
if w2 > h2:
|
||||
w2, h2 = h2, w2
|
||||
if w1 > w2:
|
||||
w1, h1, w2, h2 = w2, h2, w1, h1
|
||||
return w1 < w2 < h1 < h2 or w1 < w2 == h2 < h1
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
|
||||
def is_punct_label(label):
|
||||
return label == 'P' or label.lower() == 'punct'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -153,7 +153,7 @@ class Language(object):
|
|||
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
||||
|
||||
@classmethod
|
||||
def default_vocab(cls, package, get_lex_attr=None):
|
||||
def default_vocab(cls, package, get_lex_attr=None, vectors_package=None):
|
||||
if get_lex_attr is None:
|
||||
if package.has_file('vocab', 'oov_prob'):
|
||||
with package.open(('vocab', 'oov_prob')) as file_:
|
||||
|
@ -162,7 +162,8 @@ class Language(object):
|
|||
else:
|
||||
get_lex_attr = cls.default_lex_attrs()
|
||||
if hasattr(package, 'dir_path'):
|
||||
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
|
||||
return Vocab.from_package(package, get_lex_attr=get_lex_attr,
|
||||
vectors_package=vectors_package)
|
||||
else:
|
||||
return Vocab.load(package, get_lex_attr)
|
||||
|
||||
|
@ -198,7 +199,8 @@ class Language(object):
|
|||
matcher=None,
|
||||
serializer=None,
|
||||
load_vectors=True,
|
||||
package=None):
|
||||
package=None,
|
||||
vectors_package=None):
|
||||
"""
|
||||
a model can be specified:
|
||||
|
||||
|
@ -228,7 +230,7 @@ class Language(object):
|
|||
warn("load_vectors is deprecated", DeprecationWarning)
|
||||
|
||||
if vocab in (None, True):
|
||||
vocab = self.default_vocab(package)
|
||||
vocab = self.default_vocab(package, vectors_package=vectors_package)
|
||||
self.vocab = vocab
|
||||
if tokenizer in (None, True):
|
||||
tokenizer = Tokenizer.from_package(package, self.vocab)
|
||||
|
|
|
@ -10,4 +10,3 @@ cpdef bint like_email(unicode string)
|
|||
cpdef bint like_url(unicode string)
|
||||
cpdef bint like_number(unicode string)
|
||||
cpdef unicode word_shape(unicode string)
|
||||
cpdef bytes asciied(unicode string)
|
||||
|
|
|
@ -2,21 +2,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
import unicodedata
|
||||
|
||||
# If your license is not GPL compatible, use text_unidecode. But if your code
|
||||
# is, you should use the unidecode library, because its performance is better.
|
||||
# spaCy does not list unidecode as a dependency, in case your license is not
|
||||
# GPL compatible.
|
||||
try:
|
||||
from unidecode import unidecode
|
||||
except ImportError:
|
||||
from text_unidecode import unidecode
|
||||
|
||||
|
||||
import re
|
||||
|
||||
import math
|
||||
|
||||
|
||||
TAGS = 'adj adp adv conj det noun num pdt pos pron prt punct verb'.upper().split()
|
||||
|
||||
|
@ -171,23 +158,6 @@ cpdef unicode word_shape(unicode string):
|
|||
return ''.join(shape)
|
||||
|
||||
|
||||
cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0):
|
||||
"""Apply level 1 normalization:
|
||||
|
||||
* Case is canonicalized, using frequency statistics
|
||||
* Unicode mapped to ascii, via unidecode
|
||||
* Regional spelling variations are normalized
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
cpdef bytes asciied(unicode string):
|
||||
stripped = unidecode(string)
|
||||
if not stripped:
|
||||
return b'???'
|
||||
return stripped.encode('ascii')
|
||||
|
||||
|
||||
# Exceptions --- do not convert these
|
||||
_uk_us_except = set([
|
||||
'our',
|
||||
|
|
|
@ -23,10 +23,8 @@ import ujson as json
|
|||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
# This has to be like this for
|
||||
chars = <char*>PyUnicode_AS_DATA(string)
|
||||
size = PyUnicode_GET_DATA_SIZE(string)
|
||||
return hash64(chars, size, 1)
|
||||
chars = string.encode('utf8')
|
||||
return hash64(<char*>chars, len(chars), 1)
|
||||
|
||||
|
||||
cdef unicode _decode(const Utf8Str* string):
|
||||
|
@ -120,6 +118,11 @@ cdef class StringStore:
|
|||
else:
|
||||
raise TypeError(type(string_or_id))
|
||||
|
||||
def __contains__(self, unicode string):
|
||||
cdef hash_t key = hash_string(string)
|
||||
value = <Utf8Str*>self._map.get(key)
|
||||
return True if value is not NULL else False
|
||||
|
||||
def __iter__(self):
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
|
|
spacy/syntax/nonproj.pxd (new file, 0 lines)
spacy/syntax/nonproj.pyx (new file, 201 lines)
@@ -0,0 +1,201 @@
from copy import copy

from ..tokens.doc cimport Doc
from spacy.attrs import DEP, HEAD


def ancestors(tokenid, heads):
    # returns all words going from the word up the path to the root
    # the path to root cannot be longer than the number of words in the sentence
    # this function ends after at most len(heads) steps
    # because it would otherwise loop indefinitely on cycles
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
        if head == None:
            break


def contains_cycle(heads):
    # in an acyclic tree, the path from each word following
    # the head relation upwards always ends at the root node
    for tokenid in range(len(heads)):
        seen = set([tokenid])
        for ancestor in ancestors(tokenid,heads):
            if ancestor in seen:
                return seen
            seen.add(ancestor)
    return None


def is_nonproj_arc(tokenid, heads):
    # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
    if head == tokenid: # root arcs cannot be non-projective
        return False
    elif head == None: # unattached tokens cannot be non-projective
        return False

    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
    for k in range(start,end):
        for ancestor in ancestors(k,heads):
            if ancestor == None: # for unattached tokens/subtrees
                break
            elif ancestor == head: # normal case: k dominated by h
                break
        else: # head not in ancestors: d -> h is non-projective
            return True
    return False


def is_nonproj_tree(heads):
    # a tree is non-projective if at least one arc is non-projective
    return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )


class PseudoProjectivity:
    # implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
    # for doing pseudo-projective parsing
    # implementation uses the HEAD decoration scheme

    delimiter = '||'

    @classmethod
    def decompose(cls, label):
        return label.partition(cls.delimiter)[::2]

    @classmethod
    def is_decorated(cls, label):
        return label.find(cls.delimiter) != -1

    @classmethod
    def preprocess_training_data(cls, gold_tuples, label_freq_cutoff=30):
        preprocessed = []
        freqs = {}
        for raw_text, sents in gold_tuples:
            prepro_sents = []
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
                proj_heads,deco_labels = cls.projectivize(heads,labels)
                # set the label to ROOT for each root dependent
                deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
                # count label frequencies
                if label_freq_cutoff > 0:
                    for label in deco_labels:
                        if cls.is_decorated(label):
                            freqs[label] = freqs.get(label,0) + 1
                prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
            preprocessed.append((raw_text, prepro_sents))

        if label_freq_cutoff > 0:
            return cls._filter_labels(preprocessed,label_freq_cutoff,freqs)
        return preprocessed


    @classmethod
    def projectivize(cls, heads, labels):
        # use the algorithm by Nivre & Nilsson 2005
        # assumes heads to be a proper tree, i.e. connected and cycle-free
        # returns a new pair (heads,labels) which encode
        # a projective and decorated tree
        proj_heads = copy(heads)
        smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
        if smallest_np_arc == None: # this sentence is already projective
            return proj_heads, copy(labels)
        while smallest_np_arc != None:
            cls._lift(smallest_np_arc, proj_heads)
            smallest_np_arc = cls._get_smallest_nonproj_arc(proj_heads)
        deco_labels = cls._decorate(heads, proj_heads, labels)
        return proj_heads, deco_labels


    @classmethod
    def deprojectivize(cls, Doc tokens):
        # reattach arcs with decorated labels (following HEAD scheme)
        # for each decorated arc X||Y, search top-down, left-to-right,
        # breadth-first until hitting a Y then make this the new head
        parse = tokens.to_array([HEAD, DEP])
        labels = [ tokens.vocab.strings[int(p[1])] for p in parse ]
        for token in tokens:
            if cls.is_decorated(token.dep_):
                newlabel,headlabel = cls.decompose(token.dep_)
                newhead = cls._find_new_head(token,headlabel)
                parse[token.i,1] = tokens.vocab.strings[newlabel]
                parse[token.i,0] = newhead.i - token.i
        tokens.from_array([HEAD, DEP],parse)


    @classmethod
    def _decorate(cls, heads, proj_heads, labels):
        # uses decoration scheme HEAD from Nivre & Nilsson 2005
        assert(len(heads) == len(proj_heads) == len(labels))
        deco_labels = []
        for tokenid,head in enumerate(heads):
            if head != proj_heads[tokenid]:
                deco_labels.append('%s%s%s' % (labels[tokenid],cls.delimiter,labels[head]))
            else:
                deco_labels.append(labels[tokenid])
        return deco_labels


    @classmethod
    def _get_smallest_nonproj_arc(cls, heads):
        # return the smallest non-proj arc or None
        # where size is defined as the distance between dep and head
        # and ties are broken left to right
        smallest_size = float('inf')
        smallest_np_arc = None
        for tokenid,head in enumerate(heads):
            size = abs(tokenid-head)
            if size < smallest_size and is_nonproj_arc(tokenid,heads):
                smallest_size = size
                smallest_np_arc = tokenid
        return smallest_np_arc


    @classmethod
    def _lift(cls, tokenid, heads):
        # reattaches a word to its grandfather
        head = heads[tokenid]
        ghead = heads[head]
        # attach to ghead if head isn't attached to root else attach to root
        heads[tokenid] = ghead if head != ghead else tokenid


    @classmethod
    def _find_new_head(cls, token, headlabel):
        # search through the tree starting from root
        # returns the id of the first descendant with the given label
        # if there is none, return the current head (no change)
        queue = [token.head]
        while queue:
            next_queue = []
            for qtoken in queue:
                for child in qtoken.children:
                    if child == token:
                        continue
                    if child.dep_ == headlabel:
                        return child
                    next_queue.append(child)
            queue = next_queue
        return token.head


    @classmethod
    def _filter_labels(cls, gold_tuples, cutoff, freqs):
        # throw away infrequent decorated labels
        # can't learn them reliably anyway and keeps label set smaller
        filtered = []
        for raw_text, sents in gold_tuples:
            filtered_sents = []
            for (ids, words, tags, heads, labels, iob), ctnts in sents:
                filtered_labels = [ cls.decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
                filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
            filtered.append((raw_text, filtered_sents))
        return filtered
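As a quick orientation, the projectivize/deprojectivize mechanism added above can be exercised on plain head/label lists; the values below mirror spacy/tests/parser/test_nonproj.py later in this diff (a sketch, assuming a built spaCy checkout):

    from spacy.syntax.nonproj import PseudoProjectivity, is_nonproj_arc, is_nonproj_tree

    heads = [1, 2, 2, 4, 5, 2, 7, 4, 2]   # heads[i] is the index of token i's head
    labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct']

    assert is_nonproj_arc(7, heads)       # arc 4 -> 7 crosses the arc between 2 and 5
    assert is_nonproj_tree(heads)

    proj_heads, deco_labels = PseudoProjectivity.projectivize(heads, labels)
    print(proj_heads)    # [1, 2, 2, 4, 5, 2, 7, 5, 2]: token 7 lifted to its grandparent
    print(deco_labels)   # [..., 'acl||dobj', ...]: decoration keeps the lost head's label
    # At parse time, deprojectivize(doc) finds the decorated arc and re-attaches token 7
    # to the nearest descendant labelled 'dobj', restoring the original tree.
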
|
@ -15,5 +15,6 @@ cdef class ParserModel(AveragedPerceptron):
|
|||
cdef class Parser:
|
||||
cdef readonly ParserModel model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef int _projectivize
|
||||
|
||||
cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
|
||||
|
|
|
@ -12,12 +12,12 @@ from cpython.exc cimport PyErr_CheckSignals
|
|||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
import random
|
||||
import os.path
|
||||
from os import path
|
||||
import shutil
|
||||
import json
|
||||
import sys
|
||||
from .nonproj import PseudoProjectivity
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
@ -79,9 +79,10 @@ cdef class ParserModel(AveragedPerceptron):
|
|||
|
||||
|
||||
cdef class Parser:
|
||||
def __init__(self, StringStore strings, transition_system, ParserModel model):
|
||||
def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0):
|
||||
self.moves = transition_system
|
||||
self.model = model
|
||||
self._projectivize = projectivize
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, model_dir, strings, transition_system):
|
||||
|
@ -93,9 +94,10 @@ cdef class Parser:
|
|||
moves = transition_system(strings, cfg.labels)
|
||||
templates = get_templates(cfg.features)
|
||||
model = ParserModel(templates)
|
||||
project = cfg.projectivize if hasattr(cfg,'projectivize') else False
|
||||
if path.exists(path.join(model_dir, 'model')):
|
||||
model.load(path.join(model_dir, 'model'))
|
||||
return cls(strings, moves, model)
|
||||
return cls(strings, moves, model, project)
|
||||
|
||||
@classmethod
|
||||
def load(cls, pkg_or_str_or_file, vocab):
|
||||
|
@ -114,6 +116,9 @@ cdef class Parser:
|
|||
tokens.is_parsed = True
|
||||
# Check for KeyboardInterrupt etc. Untested
|
||||
PyErr_CheckSignals()
|
||||
# projectivize output
|
||||
if self._projectivize:
|
||||
PseudoProjectivity.deprojectivize(tokens)
|
||||
|
||||
def pipe(self, stream, int batch_size=1000, int n_threads=2):
|
||||
cdef Pool mem = Pool()
|
||||
|
|
|
@ -143,7 +143,7 @@ cdef class Tagger:
|
|||
|
||||
@classmethod
|
||||
def blank(cls, vocab, templates):
|
||||
model = TaggerModel(N_CONTEXT_FIELDS, templates)
|
||||
model = TaggerModel(templates)
|
||||
return cls(vocab, model)
|
||||
|
||||
@classmethod
|
||||
|
@ -153,10 +153,8 @@ cdef class Tagger:
|
|||
@classmethod
|
||||
def from_package(cls, pkg, vocab):
|
||||
# TODO: templates.json deprecated? not present in latest package
|
||||
templates = cls.default_templates()
|
||||
# templates = package.load_utf8(json.load,
|
||||
# 'pos', 'templates.json',
|
||||
# default=cls.default_templates())
|
||||
# templates = cls.default_templates()
|
||||
templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
|
||||
|
||||
model = TaggerModel(templates)
|
||||
if pkg.has_file('pos', 'model'):
|
||||
|
@ -221,7 +219,7 @@ cdef class Tagger:
|
|||
def train(self, Doc tokens, object gold_tag_strs):
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
for tag in gold_tag_strs:
|
||||
if tag not in self.tag_names:
|
||||
if tag != None and tag not in self.tag_names:
|
||||
msg = ("Unrecognized gold tag: %s. tag_map.json must contain all"
|
||||
"gold tags, to maintain coarse-grained mapping.")
|
||||
raise ValueError(msg % tag)
|
||||
|
@ -234,10 +232,9 @@ cdef class Tagger:
|
|||
nr_feat=self.model.nr_feat)
|
||||
for i in range(tokens.length):
|
||||
self.model.set_featuresC(&eg.c, tokens.c, i)
|
||||
eg.set_label(golds[i])
|
||||
eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
|
||||
self.model.set_scoresC(eg.c.scores,
|
||||
eg.c.features, eg.c.nr_feat)
|
||||
|
||||
self.model.updateC(&eg.c)
|
||||
|
||||
self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
|
||||
|
|
spacy/tests/__init__.py (new file, 0 lines)
spacy/tests/matcher/__init__.py (new file, 0 lines)
spacy/tests/morphology/__init__.py (new file, 0 lines)
spacy/tests/munge/__init__.py (new file, 0 lines)
spacy/tests/parser/__init__.py (new file, 0 lines)
spacy/tests/parser/test_nonproj.py (new file, 137 lines)
|
@ -0,0 +1,137 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
|
||||
from spacy.attrs import DEP, HEAD
|
||||
import numpy
|
||||
|
||||
from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc, is_nonproj_tree, PseudoProjectivity
|
||||
|
||||
def test_ancestors():
|
||||
tree = [1,2,2,4,5,2,2]
|
||||
cyclic_tree = [1,2,2,4,5,3,2]
|
||||
partial_tree = [1,2,2,4,5,None,2]
|
||||
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
|
||||
assert([ a for a in ancestors(3,tree) ] == [4,5,2])
|
||||
assert([ a for a in ancestors(3,cyclic_tree) ] == [4,5,3,4,5,3,4])
|
||||
assert([ a for a in ancestors(3,partial_tree) ] == [4,5,None])
|
||||
assert([ a for a in ancestors(17,multirooted_tree) ] == [])
|
||||
|
||||
def test_contains_cycle():
|
||||
tree = [1,2,2,4,5,2,2]
|
||||
cyclic_tree = [1,2,2,4,5,3,2]
|
||||
partial_tree = [1,2,2,4,5,None,2]
|
||||
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
|
||||
assert(contains_cycle(tree) == None)
|
||||
assert(contains_cycle(cyclic_tree) == set([3,4,5]))
|
||||
assert(contains_cycle(partial_tree) == None)
|
||||
assert(contains_cycle(multirooted_tree) == None)
|
||||
|
||||
def test_is_nonproj_arc():
|
||||
nonproj_tree = [1,2,2,4,5,2,7,4,2]
|
||||
partial_tree = [1,2,2,4,5,None,7,4,2]
|
||||
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
|
||||
assert(is_nonproj_arc(0,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(1,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(2,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(3,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(4,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(5,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(6,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(7,nonproj_tree) == True)
|
||||
assert(is_nonproj_arc(8,nonproj_tree) == False)
|
||||
assert(is_nonproj_arc(7,partial_tree) == False)
|
||||
assert(is_nonproj_arc(17,multirooted_tree) == False)
|
||||
assert(is_nonproj_arc(16,multirooted_tree) == True)
|
||||
|
||||
def test_is_nonproj_tree():
|
||||
proj_tree = [1,2,2,4,5,2,7,5,2]
|
||||
nonproj_tree = [1,2,2,4,5,2,7,4,2]
|
||||
partial_tree = [1,2,2,4,5,None,7,4,2]
|
||||
multirooted_tree = [3,2,0,3,3,7,7,3,7,10,7,10,11,12,18,16,18,17,12,3]
|
||||
assert(is_nonproj_tree(proj_tree) == False)
|
||||
assert(is_nonproj_tree(nonproj_tree) == True)
|
||||
assert(is_nonproj_tree(partial_tree) == False)
|
||||
assert(is_nonproj_tree(multirooted_tree) == True)
|
||||
|
||||
|
||||
def deprojectivize(proj_heads, deco_labels, EN):
|
||||
slen = len(proj_heads)
|
||||
sent = EN.tokenizer.tokens_from_list(['whatever'] * slen)
|
||||
rel_proj_heads = [ head-i for i,head in enumerate(proj_heads) ]
|
||||
labelids = [ EN.vocab.strings[label] for label in deco_labels ]
|
||||
pairs = list(zip(rel_proj_heads,labelids))
|
||||
parse = numpy.asarray(pairs, dtype=numpy.int32)
|
||||
sent.from_array([HEAD,DEP],parse)
|
||||
PseudoProjectivity.deprojectivize(sent)
|
||||
parse = sent.to_array([HEAD,DEP])
|
||||
deproj_heads = [ i+head for i,head in enumerate(parse[:,0]) ]
|
||||
undeco_labels = [ EN.vocab.strings[int(labelid)] for labelid in parse[:,1] ]
|
||||
return deproj_heads, undeco_labels
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_pseudoprojectivity(EN):
|
||||
tree = [1,2,2]
|
||||
nonproj_tree = [1,2,2,4,5,2,7,4,2]
|
||||
labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct']
|
||||
nonproj_tree2 = [9,1,3,1,5,6,9,8,6,1,6,12,13,10,1]
|
||||
labels2 = ['advmod','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct']
|
||||
|
||||
assert(PseudoProjectivity.decompose('X||Y') == ('X','Y'))
|
||||
assert(PseudoProjectivity.decompose('X') == ('X',''))
|
||||
|
||||
assert(PseudoProjectivity.is_decorated('X||Y') == True)
|
||||
assert(PseudoProjectivity.is_decorated('X') == False)
|
||||
|
||||
PseudoProjectivity._lift(0,tree)
|
||||
assert(tree == [2,2,2])
|
||||
|
||||
np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree)
|
||||
assert(np_arc == 7)
|
||||
|
||||
np_arc = PseudoProjectivity._get_smallest_nonproj_arc(nonproj_tree2)
|
||||
assert(np_arc == 10)
|
||||
|
||||
proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree,labels)
|
||||
assert(proj_heads == [1,2,2,4,5,2,7,5,2])
|
||||
assert(deco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl||dobj','punct'])
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
|
||||
assert(deproj_heads == nonproj_tree)
|
||||
assert(undeco_labels == labels)
|
||||
|
||||
proj_heads, deco_labels = PseudoProjectivity.projectivize(nonproj_tree2,labels2)
|
||||
assert(proj_heads == [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1])
|
||||
assert(deco_labels == ['advmod||aux','root','det','nsubj','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct'])
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
|
||||
assert(deproj_heads == nonproj_tree2)
|
||||
assert(undeco_labels == labels2)
|
||||
|
||||
# if decoration is wrong such that there is no head with the desired label
|
||||
# the structure is kept and the label is undecorated
|
||||
proj_heads = [1,2,2,4,5,2,7,5,2]
|
||||
deco_labels = ['det','nsubj','root','det','dobj','aux','nsubj','acl||iobj','punct']
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
|
||||
assert(deproj_heads == proj_heads)
|
||||
assert(undeco_labels == ['det','nsubj','root','det','dobj','aux','nsubj','acl','punct'])
|
||||
|
||||
# if there are two potential new heads, the first one is chosen even if it's wrong
|
||||
proj_heads = [1,1,3,1,5,6,9,8,6,1,9,12,13,10,1]
|
||||
deco_labels = ['advmod||aux','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod||dobj','advmod','det','amod','punct']
|
||||
deproj_heads, undeco_labels = deprojectivize(proj_heads,deco_labels,EN)
|
||||
assert(deproj_heads == [3,1,3,1,5,6,9,8,6,1,6,12,13,10,1])
|
||||
assert(undeco_labels == ['advmod','root','det','aux','advmod','det','dobj','det','nmod','aux','nmod','advmod','det','amod','punct'])
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
spacy/tests/print/__init__.py (new file, 0 lines)
spacy/tests/serialize/__init__.py (new file, 0 lines)
spacy/tests/spans/__init__.py (new file, 0 lines)
spacy/tests/tagger/__init__.py (new file, 0 lines)
spacy/tests/tokenizer/__init__.py (new file, 0 lines)
spacy/tests/tokens/__init__.py (new file, 0 lines)
spacy/tests/vectors/__init__.py (new file, 0 lines)
spacy/tests/vocab/__init__.py (new file, 0 lines)
@@ -1,18 +0,0 @@
# -*- coding: utf8 -*-

from __future__ import unicode_literals
import pytest

from spacy.orth import asciied


def test_tilde():
    string = u'hõmbre'
    assert asciied(string) == b'hombre'


def test_smart_quote():
    string = u'“'
    assert asciied(string) == b'"'
    string = u'”'
    assert asciied(string) == b'"'
|
@ -45,6 +45,11 @@ def test_symbols(en_vocab):
|
|||
assert en_vocab.strings['PROB'] == PROB
|
||||
|
||||
|
||||
def test_contains(en_vocab):
|
||||
assert 'Hello' in en_vocab
|
||||
assert 'LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm' not in en_vocab
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_pickle_vocab(en_vocab):
|
||||
file_ = io.BytesIO()
|
||||
|
|
spacy/tests/website/__init__.py (new file, 0 lines)
|
@ -201,17 +201,9 @@ cdef class Token:
|
|||
cdef int nr_iter = 0
|
||||
cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge)
|
||||
while ptr < self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
|
||||
ptr += ptr.head
|
||||
|
||||
elif ptr + ptr.head == self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
yield self.doc[ptr - (self.c - self.i)]
|
||||
ptr += 1
|
||||
else:
|
||||
ptr += 1
|
||||
ptr += 1
|
||||
nr_iter += 1
|
||||
# This is ugly, but it's a way to guard out infinite loops
|
||||
if nr_iter >= 10000000:
|
||||
|
@ -226,16 +218,10 @@ cdef class Token:
|
|||
tokens = []
|
||||
cdef int nr_iter = 0
|
||||
while ptr > self.c:
|
||||
# If this head is still to the right of us, we can skip to it
|
||||
# No token that's between this token and this head could be our
|
||||
# child.
|
||||
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
|
||||
ptr += ptr.head
|
||||
elif ptr + ptr.head == self.c:
|
||||
if ptr + ptr.head == self.c:
|
||||
tokens.append(self.doc[ptr - (self.c - self.i)])
|
||||
ptr -= 1
|
||||
else:
|
||||
ptr -= 1
|
||||
ptr -= 1
|
||||
nr_iter += 1
|
||||
if nr_iter >= 10000000:
|
||||
raise RuntimeError(
|
||||
"Possibly infinite loop encountered while looking for token.rights")
|
||||
|
|
|
@ -22,12 +22,12 @@ def get_package(data_dir):
|
|||
|
||||
def get_package_by_name(name=None, via=None):
|
||||
try:
|
||||
return sputnik.package(about.__name__, about.__version__,
|
||||
return sputnik.package(about.__title__, about.__version__,
|
||||
name or about.__default_model__, data_path=via)
|
||||
except PackageNotFoundException as e:
|
||||
raise RuntimeError("Model not installed. Please run 'python -m "
|
||||
raise RuntimeError("Model %s not installed. Please run 'python -m "
|
||||
"spacy.en.download' to install latest compatible "
|
||||
"model.")
|
||||
"model." % name)
|
||||
except CompatiblePackageNotFoundException as e:
|
||||
raise RuntimeError("Installed model is not compatible with spaCy "
|
||||
"version. Please run 'python -m spacy.en.download "
|
||||
|
|
|
@ -52,7 +52,7 @@ cdef class Vocab:
|
|||
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
|
||||
|
||||
@classmethod
|
||||
def from_package(cls, package, get_lex_attr=None):
|
||||
def from_package(cls, package, get_lex_attr=None, vectors_package=None):
|
||||
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
||||
|
||||
lemmatizer = Lemmatizer.from_package(package)
|
||||
|
@ -66,7 +66,10 @@ cdef class Vocab:
|
|||
self.strings.load(file_)
|
||||
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
|
||||
|
||||
if package.has_file('vocab', 'vec.bin'):
|
||||
if vectors_package and vectors_package.has_file('vocab', 'vec.bin'):
|
||||
self.vectors_length = self.load_vectors_from_bin_loc(
|
||||
vectors_package.file_path('vocab', 'vec.bin'))
|
||||
elif package.has_file('vocab', 'vec.bin'):
|
||||
self.vectors_length = self.load_vectors_from_bin_loc(
|
||||
package.file_path('vocab', 'vec.bin'))
|
||||
return self
|
||||
|
@ -106,25 +109,6 @@ cdef class Vocab:
|
|||
"""The current number of lexemes stored."""
|
||||
return self.length
|
||||
|
||||
def __reduce__(self):
|
||||
# TODO: This is hopelessly broken. The state is transferred as just
|
||||
# a temp directory! We then fail to clean this up. This method therefore
|
||||
# only pretends to work. What we need to do is form an archive file.
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
lex_loc = path.join(tmp_dir, 'lexemes.bin')
|
||||
str_loc = path.join(tmp_dir, 'strings.json')
|
||||
vec_loc = path.join(tmp_dir, 'vec.bin')
|
||||
|
||||
self.dump(lex_loc)
|
||||
with io.open(str_loc, 'w', encoding='utf8') as file_:
|
||||
self.strings.dump(file_)
|
||||
|
||||
self.dump_vectors(vec_loc)
|
||||
|
||||
state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
|
||||
self.serializer_freqs, self.data_dir)
|
||||
return (unpickle_vocab, state, None, None)
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
|
@ -188,6 +172,11 @@ cdef class Vocab:
|
|||
self._by_orth.set(lex.orth, <void*>lex)
|
||||
self.length += 1
|
||||
|
||||
def __contains__(self, unicode string):
|
||||
key = hash_string(string)
|
||||
lex = self._by_hash.get(key)
|
||||
return True if lex is not NULL else False
|
||||
|
||||
def __iter__(self):
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
|
@ -388,27 +377,6 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
|
||||
def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
|
||||
serializer_freqs, data_dir):
|
||||
cdef Vocab vocab = Vocab()
|
||||
|
||||
vocab.get_lex_attr = get_lex_attr
|
||||
vocab.morphology = morphology
|
||||
vocab.strings = morphology.strings
|
||||
vocab.data_dir = data_dir
|
||||
vocab.serializer_freqs = serializer_freqs
|
||||
|
||||
with io.open(strings_loc, 'r', encoding='utf8') as file_:
|
||||
vocab.strings.load(file_)
|
||||
vocab.load_lexemes(lex_loc)
|
||||
if vec_loc is not None:
|
||||
vocab.vectors_length = vocab.load_vectors_from_bin_loc(vec_loc)
|
||||
return vocab
|
||||
|
||||
|
||||
copy_reg.constructor(unpickle_vocab)
|
||||
|
||||
|
||||
def write_binary_vectors(in_loc, out_loc):
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
cdef Address mem
|
||||
|
|
tox.ini (deleted, 13 lines)

@@ -1,13 +0,0 @@
[tox]
envlist =
    py27
    py34
recreate = True

[testenv]
changedir = {envtmpdir}
deps =
    pytest
commands =
    python -m spacy.en.download
    python -m pytest {toxinidir}/spacy/ -x --models --vectors --slow
|
@ -2,7 +2,7 @@ include ../../header.jade
|
|||
include ./meta.jade
|
||||
|
||||
mixin Displacy(sentence, caption_text, height)
|
||||
- var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
|
||||
- var url = "https://displacy.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src="/resources/displacy/robots.html" height=height)
|
||||
|
@ -20,7 +20,7 @@ mixin Displacy(sentence, caption_text, height)
|
|||
|
||||
p A syntactic dependency parse is a kind of shallow meaning representation. It's an important piece of many language understanding and text processing technologies. Now that these representations can be computed quickly, and with increasingly high accuracy, they're being used in lots of applications – translation, sentiment analysis, and summarization are major application areas.
|
||||
|
||||
p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="https://spacy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="https://api.spacy.io/displacy") displaCy]:
|
||||
p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="https://spacy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="https://displacy.spacy.io/displacy") displaCy]:
|
||||
|
||||
+Displacy("Robots in popular culture are there to remind us of the awesomeness of unbounded human agency", "Click the button to full-screen and interact, or scroll to see the full parse.", 325)
|
||||
|
||||
|
@ -40,7 +40,7 @@ mixin Displacy(sentence, caption_text, height)
|
|||
|
||||
p To me, this seemed like witchcraft, or a hack at best. But I was quickly won over: if all we do is declare the data and the relationships, in standards-compliant HTML and CSS, then we can simply step back and let the browser do its job. We know the code will be small, the layout will work on a variety of display, and we'll have a ready separation of style and content. For long output, we simply let the graphic overflow, and let users scroll.
|
||||
|
||||
p What I'm particularly excited about is the potential for displaCy as an #[a(href="https://api.spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
|
||||
p What I'm particularly excited about is the potential for displaCy as an #[a(href="https://displacy.spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
|
||||
|
||||
ul
|
||||
li You're always asked a question. You don't have to decide-what-to-decide.
|
||||
|
|
|
@ -10,7 +10,7 @@ include ./meta.jade
|
|||
|
||||
p It turns out that almost anything we say could mean many many different things, but we don't notice because almost all of those meanings would be weird or stupid or just not possible. If I say:
|
||||
|
||||
p.example #[a(href="https://api.spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
|
||||
p.example #[a(href="https://displacy.spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
|
||||
|
||||
p Would you ever ask me,
|
||||
|
||||
|
@ -18,7 +18,7 @@ include ./meta.jade
|
|||
|
||||
p It's weird to even think of that. But a computer just might, because there are other cases like:
|
||||
|
||||
p.example #[a(href="https://api.spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
|
||||
p.example #[a(href="https://displacy.spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
|
||||
|
||||
p Where the words hang together in the other way. People used to think that the answer was to tell the computer lots and lots of facts. But then you wake up one day and you're writing facts like #[em movies do not wear dresses], and you wonder where it all went wrong. Actually it's even worse than that. Not only are there too many facts, most of them are not even really facts! #[a(href="https://en.wikipedia.org/wiki/Cyc") People really tried this]. We've found that the world is made up of #[em if]s and #[em but]s.
|
||||
|
||||
|
|
|
@ -118,7 +118,7 @@ mixin WritePage(Site, Author, Page)
|
|||
nav(role="navigation")
|
||||
li(class={active: Page.active.home}): a(href="/") Home
|
||||
li(class={active: Page.active.docs}): a(href="/docs") Docs
|
||||
li: a(href="https://api.spacy.io/displacy", target="_blank") Demo
|
||||
li: a(href="https://displacy.spacy.io/displacy", target="_blank") Demo
|
||||
li(class={active: Page.active.blog}): a(href="/blog") Blog
|
||||
main#content
|
||||
block
|
||||
|
|
|
@ -60,7 +60,7 @@ mixin Option(name, open)
|
|||
| compile a local Python install. Run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
| $ curl https://gist.githubusercontent.com/henningpeters/e0c89c0d640b575ce0fb/raw/63c5b8def46629359465816902181d5708dd582b/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
|
||||
+Option("Compile from source", false)
|
||||
|
@ -79,12 +79,11 @@ mixin Option(name, open)
|
|||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ pip install -e .
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test spacy/tests/
|
||||
| $ python -m pytest spacy
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
mixin Displacy(sentence, caption_text, height)
|
||||
- var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
|
||||
- var url = "https://displacy.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src="/resources/displacy/displacy_demo.html" height=height)
|
||||
|
@ -17,6 +17,6 @@ mixin Displacy(sentence, caption_text, height)
|
|||
275
|
||||
)
|
||||
|
||||
p #[a(href="https://api.spacy.io/displacy") displaCy] lets you peek inside spaCy's syntactic parser, as it reads a sentence word-by-word. By repeatedly choosing from a small set of actions, it links the words together according to their syntactic structure. This type of representation powers a wide range of technologies, from translation and summarization, to sentiment analysis and algorithmic trading. #[a(href="/blog/displacy") Read more.]
|
||||
p #[a(href="https://displacy.spacy.io/displacy") displaCy] lets you peek inside spaCy's syntactic parser, as it reads a sentence word-by-word. By repeatedly choosing from a small set of actions, it links the words together according to their syntactic structure. This type of representation powers a wide range of technologies, from translation and summarization, to sentiment analysis and algorithmic trading. #[a(href="/blog/displacy") Read more.]
|
||||
|
||||
|
||||
|
|