* Merge in changes from de branch

This commit is contained in:
Matthew Honnibal 2015-09-06 19:49:28 +02:00
commit 86c888667f
50 changed files with 2351 additions and 973 deletions

View File

@ -20,6 +20,7 @@ from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
import json
import plac
from pathlib import Path
@ -29,8 +30,6 @@ from shutil import copytree
import codecs
from collections import defaultdict
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string
@ -38,6 +37,13 @@ from preshed.counter import PreshCounter
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.en
import spacy.de
import spacy.fi
import spacy.it
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
@ -139,7 +145,7 @@ def _read_senses(loc):
return lexicon
def setup_vocab(src_dir, dst_dir):
def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir):
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
if not probs:
probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
if not probs:
oov_prob = 0.0
oov_prob = -20
else:
oov_prob = min(probs.values())
for word in clusters:
@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir):
lexicon = []
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word)
entry['prob'] = float(prob)
cluster = clusters.get(word, '0')
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
entry['cluster'] = int(cluster[::-1], 2)
vocab[word] = entry
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt'))
with (dst_dir / 'oov_prob').open('w') as file_:
file_.write('%f' % oov_prob)
def main(lang_data_dir, corpora_dir, model_dir):
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
languages = {
'en': spacy.en.English.default_lex_attrs(),
'de': spacy.de.Deutsch.default_lex_attrs(),
'fi': spacy.fi.Finnish.default_lex_attrs(),
'it': spacy.it.Italian.default_lex_attrs(),
}
model_dir = Path(model_dir)
lang_data_dir = Path(lang_data_dir)
corpora_dir = Path(corpora_dir)
lang_data_dir = Path(lang_data_dir) / lang_id
corpora_dir = Path(corpora_dir) / lang_id
assert corpora_dir.exists()
assert lang_data_dir.exists()
@ -187,13 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir):
if not model_dir.exists():
model_dir.mkdir()
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')
if (lang_data_dir / 'gazetteer.json').exists():
copyfile(str(lang_data_dir / 'gazetteer.json'),
str(model_dir / 'vocab' / 'gazetteer.json'))
if not (model_dir / 'wordnet').exists():
if (lang_data_dir / 'lemma_rules.json').exists():
copyfile(str(lang_data_dir / 'lemma_rules.json'),
str(model_dir / 'vocab' / 'lemma_rules.json'))
if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))

View File

@ -14,7 +14,6 @@ import re
import spacy.util
from spacy.en import English
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
from spacy.syntax.util import Config
from spacy.gold import read_json_file
@ -22,6 +21,11 @@ from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.ner import BiluoPushDown
from spacy.tagger import Tagger
from spacy.syntax.parser import Parser
def _corrupt(c, noise_level):
if random.random() >= noise_level:
@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
beam_width=1, verbose=False,
use_orig_arc_eager=False):
dep_model_dir = path.join(model_dir, 'deps')
pos_model_dir = path.join(model_dir, 'pos')
ner_model_dir = path.join(model_dir, 'ner')
if path.exists(dep_model_dir):
shutil.rmtree(dep_model_dir)
if path.exists(pos_model_dir):
shutil.rmtree(pos_model_dir)
if path.exists(ner_model_dir):
shutil.rmtree(ner_model_dir)
os.mkdir(dep_model_dir)
os.mkdir(pos_model_dir)
os.mkdir(ner_model_dir)
setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
labels=ArcEager.get_labels(gold_tuples),
beam_width=beam_width)
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
labels=BiluoPushDown.get_labels(gold_tuples),
beam_width=0)
if n_sents > 0:
gold_tuples = gold_tuples[:n_sents]
nlp = Language(data_dir=model_dir)
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training()
nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):

175
bin/tagger/train.py Executable file
View File

@ -0,0 +1,175 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
import shutil
import codecs
import random
import plac
import re
import spacy.util
from spacy.en import English
from spacy.tagger import Tagger
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def score_model(scorer, nlp, raw_text, annot_tuples):
    """Tag one example with the current model and record its score.

    When ``raw_text`` is None the tokens are built from the gold word list
    (``annot_tuples[1]``); otherwise the raw text is run through the
    tokenizer. The tagger is then applied and the result is handed to
    ``scorer.score`` together with a ``GoldParse`` built from
    ``annot_tuples``. Nothing is returned.
    """
    tokenizer = nlp.tokenizer
    if raw_text is not None:
        tokens = tokenizer(raw_text)
    else:
        tokens = tokenizer.tokens_from_list(annot_tuples[1])
    nlp.tagger(tokens)
    scorer.score(tokens, GoldParse(tokens, annot_tuples))
def _merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[1].extend(words)
m_deps[2].extend(tags)
m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels)
m_deps[5].extend(ner)
m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
i += len(ids)
return [(m_deps, m_brackets)]
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    """Train a blank tagger on ``gold_tuples`` and save it via ``end_training``.

    ``gold_tuples`` is a sequence of ``(raw_text, sents)`` pairs. With
    ``gold_preproc`` the gold tokenization of each sentence is used;
    otherwise the sentences of a document are merged and the raw text is
    tokenized. ``n_sents > 0`` truncates the training data to that many
    entries. One progress line is printed per iteration.

    ``feat_set``, ``seed``, ``corruption_level``, ``beam_width``, ``verbose``
    and ``use_orig_arc_eager`` are accepted but not used by this function.
    """
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    tagger_templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, tagger_templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, _ in sents:
                words, gold_tags = annot_tuples[1], annot_tuples[2]
                # Score with the current weights before updating on this example.
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is not None:
                    tokens = nlp.tokenizer(raw_text)
                else:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                loss += nlp.tagger.train(tokens, gold_tags)
        # Shuffles in place; without truncation this reorders the caller's list.
        random.shuffle(gold_tuples)
        report = (itn, loss, scorer.uas, scorer.ents_f,
                  scorer.tags_acc, scorer.token_acc)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % report)
    nlp.end_training(model_dir)
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    """Run the pipeline loaded from ``model_dir`` over ``gold_tuples`` and score it.

    With ``gold_preproc`` each sentence is processed from its gold word
    list, applying tagger, entity recogniser and parser in that order.
    Otherwise the sentences of a document are merged and the raw text is
    passed through ``nlp(...)``. ``beam_width``, when given, is written to
    ``nlp.parser.cfg.beam_width``.

    Returns the populated ``Scorer``.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, _ in sents:
            if raw_text is not None:
                tokens = nlp(raw_text, merge_mwes=False)
            else:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            scorer.score(tokens, GoldParse(tokens, annot_tuples), verbose=verbose)
    return scorer
def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    """Parse the data at ``dev_loc`` and write one token per line to ``out_loc``.

    Each output line holds the token text, its tag, the text of its head and
    its dependency label, separated by tabs. The sentences of each document
    are merged before processing. ``beam_width``, when given, is written to
    ``nlp.parser.cfg.beam_width``.

    Returns the ``Scorer`` populated while writing.
    """
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    # The context manager guarantees the output is flushed and closed even
    # if processing raises part-way through; previously the handle leaked.
    with codecs.open(out_loc, 'w', 'utf8') as out_file:
        for raw_text, sents in gold_tuples:
            sents = _merge_sents(sents)
            for annot_tuples, brackets in sents:
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                    nlp.tagger(tokens)
                    nlp.entity(tokens)
                    nlp.parser(tokens)
                else:
                    tokens = nlp(raw_text, merge_mwes=False)
                gold = GoldParse(tokens, annot_tuples)
                scorer.score(tokens, gold, verbose=False)
                for t in tokens:
                    out_file.write(
                        '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                    )
    return scorer
@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    # Command-line entry point: optionally train the English tagger on
    # train_loc, then evaluate on dev_loc and print the scorer's metrics.
    # (Kept as comments rather than a docstring so the plac-generated help
    # text is unchanged.)
    if not eval_only:
        feature_set = 'debug' if debug else 'basic'
        training_data = list(read_json_file(train_loc))
        train(English, training_data, model_dir,
              feat_set=feature_set,
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    # Writing parses to out_loc is currently disabled:
    # if out_loc:
    #     write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    dev_data = list(read_json_file(dev_loc))
    scorer = evaluate(English, dev_data, model_dir,
                      gold_preproc=gold_preproc, verbose=verbose)
    metrics = (
        ('TOK', 'token_acc'),
        ('POS', 'tags_acc'),
        ('UAS', 'uas'),
        ('LAS', 'las'),
        ('NER P', 'ents_p'),
        ('NER R', 'ents_r'),
        ('NER F', 'ents_f'),
    )
    # Each attribute is read just before it is printed, one metric at a time.
    for label, attr_name in metrics:
        print(label, getattr(scorer, attr_name))


if __name__ == '__main__':
    plac.call(main)

3
lang_data/de/infix.txt Normal file
View File

@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-Z])

View File

0
lang_data/de/morphs.json Normal file
View File

21
lang_data/de/prefix.txt Normal file
View File

@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
'
``
`
#
US$
C$
A$
a-
....
...

3
lang_data/de/sample.txt Normal file
View File

@ -0,0 +1,3 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.
Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.

149
lang_data/de/specials.json Normal file
View File

@ -0,0 +1,149 @@
{
"a.m.": [{"F": "a.m."}],
"p.m.": [{"F": "p.m."}],
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
"Jan.": [{"F": "Jan.", "L": "Januar"}],
"Feb.": [{"F": "Feb.", "L": "Februar"}],
"Mär.": [{"F": "Mär.", "L": "März"}],
"Apr.": [{"F": "Apr.", "L": "April"}],
"Mai.": [{"F": "Mai.", "L": "Mai"}],
"Jun.": [{"F": "Jun.", "L": "Juni"}],
"Jul.": [{"F": "Jul.", "L": "Juli"}],
"Aug.": [{"F": "Aug.", "L": "August"}],
"Sep.": [{"F": "Sep.", "L": "September"}],
"Sept.": [{"F": "Sept.", "L": "September"}],
"Okt.": [{"F": "Okt.", "L": "Oktober"}],
"Nov.": [{"F": "Nov.", "L": "November"}],
"Dez.": [{"F": "Dez.", "L": "Dezember"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
"z.b.": [{"F": "z.b."}],
"e.h.": [{"F": "I.e."}],
"o.ä.": [{"F": "I.E."}],
"bzw.": [{"F": "bzw."}],
"usw.": [{"F": "usw."}],
"\n": [{"F": "\n", "pos": "SP"}],
"\t": [{"F": "\t", "pos": "SP"}],
" ": [{"F": " ", "pos": "SP"}]
}

26
lang_data/de/suffix.txt Normal file
View File

@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
''
's
'S
s
S
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km

56
lang_data/de/tag_map.json Normal file
View File

@ -0,0 +1,56 @@
{
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"}
}

View File

@ -0,0 +1,31 @@
{
"noun": [
["s", ""],
["ses", "s"],
["ves", "f"],
["xes", "x"],
["zes", "z"],
["ches", "ch"],
["shes", "sh"],
["men", "man"],
["ies", "y"]
],
"verb": [
["s", ""],
["ies", "y"],
["es", "e"],
["es", ""],
["ed", "e"],
["ed", ""],
["ing", "e"],
["ing", ""]
],
"adj": [
["er", ""],
["est", ""],
["er", "e"],
["est", "e"]
]
}

60
lang_data/en/tag_map.json Normal file
View File

@ -0,0 +1,60 @@
{
".": {"pos": "punct", "puncttype": "peri"},
",": {"pos": "punct", "puncttype": "comm"},
"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
":": {"pos": "punct"},
"$": {"pos": "sym", "other": {"symtype": "currency"}},
"#": {"pos": "sym", "other": {"symtype": "numbersign"}},
"AFX": {"pos": "adj", "hyph": "hyph"},
"CC": {"pos": "conj", "conjtype": "coor"},
"CD": {"pos": "num", "numtype": "card"},
"DT": {"pos": "adj", "prontype": "prn"},
"EX": {"pos": "adv", "advtype": "ex"},
"FW": {"pos": "x", "foreign": "foreign"},
"HYPH": {"pos": "punct", "puncttype": "dash"},
"IN": {"pos": "adp"},
"JJ": {"pos": "adj", "degree": "pos"},
"JJR": {"pos": "adj", "degree": "comp"},
"JJS": {"pos": "adj", "degree": "sup"},
"LS": {"pos": "punct", "numtype": "ord"},
"MD": {"pos": "verb", "verbtype": "mod"},
"NIL": {"pos": "no_tag"},
"NN": {"pos": "noun", "number": "sing"},
"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
"NNS": {"pos": "noun", "number": "plur"},
"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"},
"POS": {"pos": "part", "poss": "poss"},
"PRP": {"pos": "noun", "prontype": "prs"},
"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"},
"RB": {"pos": "adv", "degree": "pos"},
"RBR": {"pos": "adv", "degree": "comp"},
"RBS": {"pos": "adv", "degree": "sup"},
"RP": {"pos": "part"},
"SYM": {"pos": "sym"},
"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
"UH": {"pos": "intJ"},
"VB": {"pos": "verb", "verbform": "inf"},
"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"},
"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"},
"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3},
"WDT": {"pos": "adj", "prontype": "int|rel"},
"WP": {"pos": "noun", "prontype": "int|rel"},
"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
"WRB": {"pos": "adv", "prontype": "int|rel"},
"SP": {"pos": "space"},
"ADD": {"pos": "x"},
"NFP": {"pos": "punct"},
"GW": {"pos": "x"},
"AFX": {"pos": "x"},
"HYPH": {"pos": "punct"},
"XX": {"pos": "x"},
"BES": {"pos": "verb"},
"HVS": {"pos": "verb"}
}

View File

@ -153,7 +153,7 @@ def main(modules, is_pypy):
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
'spacy.morphology',
'spacy.morphology', 'spacy.tagger',
'spacy.syntax.stateclass',
'spacy._ml', 'spacy._theano',
'spacy.tokenizer', 'spacy.en.attrs',

View File

@ -91,6 +91,8 @@ cdef class Model:
count_feats(counts[guess], feats, n_feats, -cost)
self._model.update(counts)
def end_training(self):
def end_training(self, model_loc=None):
if model_loc is None:
model_loc = self.model_loc
self._model.end_training()
self._model.dump(self.model_loc, freq_thresh=0)
self._model.dump(model_loc, freq_thresh=0)

View File

@ -84,3 +84,4 @@ cpdef enum attr_id_t:
ENT_TYPE
HEAD
SPACY
PROB

View File

@ -1,181 +1,12 @@
from __future__ import unicode_literals
from __future__ import unicode_literals, print_function
from os import path
import re
import struct
import json
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..matcher import Matcher
from ..language import Language
from ..tokens import Doc
from ..multi_words import RegexMerger
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes
from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string, oov_prob=-30, is_oov=False):
return {
'flags': get_flags(string, is_oov=is_oov),
'length': len(string),
'orth': string,
'lower': string.lower(),
'norm': string,
'shape': orth.word_shape(string),
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': oov_prob,
'sentiment': 0
}
if_model_present = -1
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
class English(object):
"""The English NLP pipeline.
Example:
Load data from default directory:
>>> nlp = English()
>>> nlp = English(data_dir=u'')
Load data from specified directory:
>>> nlp = English(data_dir=u'path/to/data_directory')
Disable (and avoid loading) parts of the processing pipeline:
>>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
Start with nothing loaded:
>>> nlp = English(data_dir=None)
"""
ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self,
data_dir=LOCAL_DATA_DIR,
Tokenizer=Tokenizer.from_dir,
Tagger=EnPosTagger,
Parser=ParserFactory(ParserTransitionSystem),
Entity=ParserFactory(EntityTransitionSystem),
Matcher=Matcher.from_dir,
Packer=None,
load_vectors=True
):
self.data_dir = data_dir
if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read())
else:
oov_prob = None
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors,
pos_tags=POS_TAGS,
oov_prob=oov_prob)
if Tagger is True:
Tagger = EnPosTagger
if Parser is True:
transition_system = self.ParserTransitionSystem
Parser = lambda s, d: parser.Parser(s, d, transition_system)
if Entity is True:
transition_system = self.EntityTransitionSystem
Entity = lambda s, d: parser.Parser(s, d, transition_system)
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir)
else:
self.tagger = None
if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else:
self.parser = None
if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
if Matcher:
self.matcher = Matcher(self.vocab, data_dir)
else:
self.matcher = None
if Packer:
self.packer = Packer(self.vocab, data_dir)
else:
self.packer = None
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.matcher and entity:
self.matcher(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
if merge_mwes and self.mwe_merger is not None:
self.mwe_merger(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training()
self.entity.model.end_training()
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
@property
def tags(self):
"""Deprecated. List of part-of-speech tag names."""
return self.tagger.tag_names
class English(Language):
@classmethod
def default_data_dir(cls):
return LOCAL_DATA_DIR

View File

@ -1,105 +0,0 @@
from __future__ import unicode_literals
from os import path
import codecs
# Suffix-rewrite tables, one per part of speech. Each entry is an
# (old_suffix, new_suffix) pair: lemmatize() replaces a matching old suffix
# with the new one and keeps the result only if it is found in the index.

# Rules applied to nouns.
NOUN_RULES = (
('s', ''),
('ses', 's'),
('ves', 'f'),
('xes', 'x'),
('zes', 'z'),
('ches', 'ch'),
('shes', 'sh'),
('men', 'man'),
('ies', 'y')
)
# Rules applied to verbs. The same suffix may appear twice with different
# replacements (e.g. "es" -> "e" and "es" -> ""); every match is tried.
VERB_RULES = (
("s", ""),
("ies", "y"),
("es", "e"),
("es", ""),
("ed", "e"),
("ed", ""),
("ing", "e"),
("ing", "")
)
# Rules applied to adjectives.
ADJ_RULES = (
("er", ""),
("est", ""),
("er", "e"),
("est", "e")
)
class Lemmatizer(object):
    """Rule-based lemmatizer backed by index and exception files.

    For each of 'adj', 'adv', 'noun' and 'verb', the constructor loads
    ``index.<pos>`` and ``<pos>.exc`` from ``wn_dict_dir``. Instances are
    called with a string and a part-of-speech id, which is compared against
    the noun, verb and adjective ids supplied at construction time.
    """

    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
        self.noun_id, self.verb_id, self.adj_id = noun_id, verb_id, adj_id
        self.index, self.exc = {}, {}
        for pos_name in ('adj', 'adv', 'noun', 'verb'):
            index_loc = path.join(wn_dict_dir, 'index.%s' % pos_name)
            exc_loc = path.join(wn_dict_dir, '%s.exc' % pos_name)
            self.index[pos_name] = read_index(index_loc)
            self.exc[pos_name] = read_exc(exc_loc)

    def __call__(self, string, pos):
        """Return the set of lemmas for ``string`` under part of speech ``pos``.

        Raises ``Exception`` when ``pos`` equals none of the configured ids.
        """
        # Checked in noun, verb, adjective order; the first equal id wins.
        handlers = (
            (self.noun_id, self.noun),
            (self.verb_id, self.verb),
            (self.adj_id, self.adj),
        )
        for pos_id, handler in handlers:
            if pos == pos_id:
                return handler(string)
        raise Exception("Cannot lemmatize with unknown pos: %s" % pos)

    def _lemmatize_as(self, string, pos_name, rules):
        # Shared body of noun(), verb() and adj().
        return lemmatize(string, self.index[pos_name], self.exc[pos_name], rules)

    def noun(self, string):
        """Lemmatize ``string`` using the noun index, exceptions and rules."""
        return self._lemmatize_as(string, 'noun', NOUN_RULES)

    def verb(self, string):
        """Lemmatize ``string`` using the verb index, exceptions and rules."""
        return self._lemmatize_as(string, 'verb', VERB_RULES)

    def adj(self, string):
        """Lemmatize ``string`` using the adjective index, exceptions and rules."""
        return self._lemmatize_as(string, 'adj', ADJ_RULES)
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for ``string``.

    The lower-cased string is a candidate if it is in ``index``; any forms
    listed for it in ``exceptions`` are candidates; and for every
    ``(old, new)`` pair in ``rules`` whose ``old`` suffix matches, the
    rewritten form is a candidate if it is in ``index``. When nothing
    qualifies, the lower-cased string itself is returned as the only member.
    """
    lowered = string.lower()
    candidates = set()
    if lowered in index:
        candidates.add(lowered)
    candidates.update(exceptions.get(lowered, []))
    for old_suffix, new_suffix in rules:
        if not lowered.endswith(old_suffix):
            continue
        rewritten = lowered[:len(lowered) - len(old_suffix)] + new_suffix
        if rewritten in index:
            candidates.add(rewritten)
    if not candidates:
        candidates.add(lowered)
    return candidates
def read_index(loc):
    """Read the set of single-word entries from an index file.

    The file at ``loc`` is read as UTF-8. Lines starting with a space are
    skipped, as are blank lines. The first whitespace-separated field of
    every other line is the entry; entries containing an underscore are
    left out.

    Returns a set of strings.
    """
    index = set()
    # Close the file deterministically instead of leaking the handle.
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: previously raised IndexError on pieces[0].
                continue
            word = pieces[0]
            if '_' not in word:
                index.add(word)
    return index
def read_exc(loc):
    """Read an exceptions file into a dict.

    The file at ``loc`` is read as UTF-8. Lines starting with a space are
    skipped, as are blank lines. On every other line the first
    whitespace-separated field is the key and the remaining fields become
    its value, as a tuple. A key that appears again replaces the earlier
    entry.

    Returns a dict mapping strings to tuples of strings.
    """
    exceptions = {}
    # Close the file deterministically instead of leaking the handle.
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: previously raised IndexError on pieces[0].
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

View File

@ -1,26 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..parts_of_speech cimport univ_pos_t
from .lemmatizer import Lemmatizer
from ..tagger cimport Tagger
cdef class EnPosTagger:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef readonly Model model
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef public dict freqs
cdef PosTag* tags
cdef readonly object tag_names
cdef readonly object tag_map
cdef readonly int n_tags
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
cdef class EnPosTagger(Tagger):
pass

View File

@ -1,389 +1,11 @@
from os import path
import json
import os
import shutil
from libc.string cimport memset
from ..parts_of_speech cimport NOUN, VERB, ADJ
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t
from .lemmatizer import Lemmatizer
from ..lemmatizer import Lemmatizer
# Grammatical person values; POS_TAGS uses THIRD for the 'person' property.
cpdef enum en_person_t:
NO_PERSON
FIRST
SECOND
THIRD
NON_THIRD
# Grammatical number values; POS_TAGS uses PLURAL for the 'number' property.
cpdef enum en_number_t:
NO_NUMBER
SINGULAR
PLURAL
MASS
# Grammatical gender values; NO_GENDER is the first (zero) member.
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
# Case values; POS_TAGS uses GENITIVE for the 'case' property.
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
# Combined tense/aspect values used for the 'tenspect' property in POS_TAGS.
cpdef enum en_tenspect_t:
NO_TENSE
BASE_VERB
PRESENT
PAST
PASSIVE
ING
MODAL
# Miscellaneous values used for the 'misc' property in POS_TAGS.
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
# Indices into the feature context array. There are eight consecutive slots
# per token (orth, cluster, shape, prefix, suffix, pos, lemma, flags) for the
# two previous tokens (P2, P1), the current word (W) and the two next tokens
# (N1, N2). N_CONTEXT_FIELDS, the last member, is used as the array length.
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
# Maps each fine-grained tag string to a pair: (coarse part-of-speech,
# dict of morphological properties). The property keys match those read by
# set_morph_from_dict ('number', 'tenspect', 'person', 'case', 'misc').
POS_TAGS = {
'NULL': (NO_TAG, {}),
'EOL': (EOL, {}),
'CC': (CONJ, {}),
'CD': (NUM, {}),
'DT': (DET, {}),
'EX': (DET, {}),
'FW': (X, {}),
'IN': (ADP, {}),
'JJ': (ADJ, {}),
'JJR': (ADJ, {'misc': COMPARATIVE}),
'JJS': (ADJ, {'misc': SUPERLATIVE}),
'LS': (X, {}),
'MD': (VERB, {'tenspect': MODAL}),
'NN': (NOUN, {}),
'NNS': (NOUN, {'number': PLURAL}),
'NNP': (NOUN, {'misc': NAME}),
'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
'PDT': (DET, {}),
'POS': (PRT, {'case': GENITIVE}),
'PRP': (PRON, {}),
'PRP$': (PRON, {'case': GENITIVE}),
'RB': (ADV, {}),
'RBR': (ADV, {'misc': COMPARATIVE}),
'RBS': (ADV, {'misc': SUPERLATIVE}),
'RP': (PRT, {}),
'SYM': (X, {}),
'TO': (PRT, {}),
'UH': (X, {}),
'VB': (VERB, {}),
'VBD': (VERB, {'tenspect': PAST}),
'VBG': (VERB, {'tenspect': ING}),
'VBN': (VERB, {'tenspect': PASSIVE}),
'VBP': (VERB, {'tenspect': PRESENT}),
'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
'WDT': (DET, {'misc': RELATIVE}),
'WP': (PRON, {'misc': RELATIVE}),
'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
'WRB': (ADV, {'misc': RELATIVE}),
'!': (PUNCT, {}),
'#': (PUNCT, {}),
'$': (PUNCT, {}),
"''": (PUNCT, {}),
"(": (PUNCT, {}),
")": (PUNCT, {}),
"-LRB-": (PUNCT, {}),
"-RRB-": (PUNCT, {}),
".": (PUNCT, {}),
",": (PUNCT, {}),
"``": (PUNCT, {}),
":": (PUNCT, {}),
"?": (PUNCT, {}),
"ADD": (X, {}),
"NFP": (PUNCT, {}),
"GW": (X, {}),
"AFX": (X, {}),
"HYPH": (PUNCT, {}),
"XX": (X, {}),
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
"SP": (SPACE, {})
}
# Tuples of context-field indices. A one-element tuple names a single field,
# longer tuples name a combination of fields.
# NOTE(review): presumably these are the feature templates handed to the
# model via the 'templates' config entry -- confirm against the caller.
POS_TEMPLATES = (
(W_orth,),
(P1_lemma, P1_pos),
(P2_lemma, P2_pos),
(N1_orth,),
(N2_orth,),
(W_suffix,),
(W_prefix,),
(P1_pos,),
(P2_pos,),
(P1_pos, P2_pos),
(P1_pos, W_orth),
(P1_suffix,),
(N1_suffix,),
(W_shape,),
(W_cluster,),
(N1_cluster,),
(N2_cluster,),
(P1_cluster,),
(P2_cluster,),
(W_flags,),
(N1_flags,),
(N2_flags,),
(P1_flags,),
(P2_flags,),
)
# Value stored in the tagger's morphology cache: the morphological analysis
# and lemma id for one (tag, word) pair.
cdef struct _CachedMorph:
Morphology morph
int lemma
def setup_model_dir(tag_names, tag_map, templates, model_dir):
    """Create an empty model directory containing only ``config.json``.

    Anything already at ``model_dir`` is deleted first. The config file stores
    the given templates, tag names and tag map as JSON.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config_loc = path.join(model_dir, 'config.json')
    settings = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map
    }
    with open(config_loc, 'w') as file_:
        file_.write(json.dumps(settings))
cdef class EnPosTagger:
cdef class EnPosTagger(Tagger):
"""A part-of-speech tagger for English"""
def __init__(self, StringStore strings, data_dir):
# Build the tagger from files under data_dir: pos/config.json supplies the
# tag names, the tag map and the feature templates.
self.mem = Pool()
model_dir = path.join(data_dir, 'pos')
self.strings = strings
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
self.tag_names = sorted(cfg['tag_names'])
assert self.tag_names
self.n_tags = len(self.tag_names)
self.tag_map = cfg['tag_map']
# The local n_tags is one larger than self.n_tags; it sizes the model, the
# morphology cache and the tags array.
cdef int n_tags = len(self.tag_names) + 1
self.model = Model(n_tags, cfg['templates'], model_dir)
self._morph_cache = PreshMapArray(n_tags)
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
# Tag ids follow the sorted order of the tag names.
for i, tag in enumerate(sorted(self.tag_names)):
pos, props = self.tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
set_morph_from_dict(&self.tags[i].morph, props)
# Per-form overrides are optional and only loaded when the file exists.
if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
# Every known tag, and id 0, starts with a frequency of 1.
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
# NOTE(review): neither open() call above is closed explicitly.
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
"""
# An empty document returns 0 immediately.
if tokens.length == 0:
return 0
cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
for i in range(tokens.length):
# Tokens whose pos is already non-zero are left as they are.
if tokens.data[i].pos == 0:
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
tokens.is_tagged = True
# NOTE(review): presumably this discards cached Python token objects so
# they are rebuilt with the new tags -- confirm in Doc.
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Doc tokens, object tag_strs):
# Assign the given tag strings (one per token) instead of predicting them.
# tag_names.index raises ValueError for a tag that is not in tag_names.
cdef int i
for i in range(tokens.length):
tokens.data[i].tag = self.strings[tag_strs[i]]
self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
tokens.data)
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Doc tokens, object gold_tag_strs):
# Run one training pass over the document and return the number of tokens
# counted as correct (tokens with zero loss).
cdef int i
cdef int loss
cdef atom_t[N_CONTEXT_FIELDS] context
cdef const weight_t* scores
# A gold tag of None is encoded as -1.
golds = [self.tag_names.index(g) if g is not None else -1
for g in gold_tag_strs]
correct = 0
for i in range(tokens.length):
fill_context(context, i, tokens.data)
scores = self.model.score(context)
guess = arg_max(scores, self.model.n_classes)
# Tokens without a gold tag always get zero loss.
loss = guess != golds[i] if golds[i] != -1 else 0
self.model.update(context, guess, golds[i], loss)
# The predicted tag, not the gold tag, is written onto the token.
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
# Set pos, lemma and morph on tokens[i] for the given tag. The lemma and
# morphology are cached per (tag id, orth id), so each pair is lemmatized
# only once.
tokens[i].pos = tag.pos
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
if cached is NULL:
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
cached.morph = tag.morph
self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
# Return the string id of the lemma for lex. The orth id itself is returned
# when there is no lemmatizer or pos is not NOUN, VERB or ADJ.
if self.lemmatizer is None:
return lex.orth
cdef unicode py_string = self.strings[lex.orth]
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.orth
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, pos)
# Of several candidates, the first in sorted order is chosen.
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings[lemma_string]
return lemma
def load_morph_exceptions(self, dict exc):
# Pre-fill the morphology cache from exc, a dict of
# tag string -> {form string -> property dict}.
cdef unicode pos_str
cdef unicode form_str
cdef unicode lemma_str
cdef dict entries
cdef dict props
cdef int lemma
cdef attr_t orth
cdef int pos
for pos_str, entries in exc.items():
# Here pos is the index of the tag in tag_names, used as the cache row.
pos = self.tag_names.index(pos_str)
for form_str, props in entries.items():
# The 'L' property holds the lemma; it defaults to the form itself.
lemma_str = props.get('L', form_str)
orth = self.strings[form_str]
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
cached.lemma = self.strings[lemma_str]
set_morph_from_dict(&cached.morph, props)
self._morph_cache.set(pos, orth, <void*>cached)
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
# Fill the context array from the five-token window centred on tokens[i].
# NOTE(review): tokens[i-2] .. tokens[i+2] are read without bounds checks;
# presumably the token array is padded on both sides -- confirm.
_fill_from_token(&context[P2_orth], &tokens[i-2])
_fill_from_token(&context[P1_orth], &tokens[i-1])
_fill_from_token(&context[W_orth], &tokens[i])
_fill_from_token(&context[N1_orth], &tokens[i+1])
_fill_from_token(&context[N2_orth], &tokens[i+2])
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
# Write the eight features of one token into context[0..7].
# Slot 0 (named *_orth in the context enum) holds the lower-cased form id.
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.tag
context[6] = t.lemma
# Slot 7 is a single category code; the first matching flag wins:
# alpha=1, punct=2, URL=3, number=4, otherwise 0.
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0
def make_lemmatizer(self, data_dir):
    """Build the lemmatizer from the ``wordnet`` directory under ``data_dir``.

    The Lemmatizer imported from ``..lemmatizer`` is constructed from
    ``(index, exceptions, rules)``, so it cannot be called with a directory
    and part-of-speech ids; its ``from_dir`` constructor loads those tables
    from the directory instead.
    """
    return Lemmatizer.from_dir(path.join(data_dir, 'wordnet'))

11
spacy/fi/__init__.py Normal file
View File

@ -0,0 +1,11 @@
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language
class Finnish(Language):
    """Finnish pipeline; its default data directory sits next to this module."""

    @classmethod
    def default_data_dir(cls):
        package_dir = path.dirname(__file__)
        return path.join(package_dir, 'data')

252
spacy/language.py Normal file
View File

@ -0,0 +1,252 @@
from os import path
try:
import ujson as json
except ImportError:
import json
from .tokenizer import Tokenizer
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
from .matcher import Matcher
from .serialize.packer import Packer
from ._ml import Model
from . import attrs
from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
class Language(object):
    """Base class for a language-specific processing pipeline.

    An instance holds a vocab, a tokenizer and optional tagger, parser, entity
    recogniser and matcher. The ``default_*`` classmethods build each piece
    from a data directory; subclasses override them (for example
    ``default_data_dir``) to customise a language.
    """

    # Lexical attribute getters. Each takes a unicode string and returns the
    # value of one attribute; ``default_lex_attrs`` maps attribute ids to them.
    @staticmethod
    def lower(string):
        return string.lower()

    @staticmethod
    def norm(string):
        return string

    @staticmethod
    def shape(string):
        return orth.word_shape(string)

    @staticmethod
    def prefix(string):
        # Slicing, unlike string[0], yields '' for the empty string instead of
        # raising IndexError; ``suffix`` below already behaves that way.
        return string[:1]

    @staticmethod
    def suffix(string):
        return string[-3:]

    @staticmethod
    def prob(string):
        return -30

    @staticmethod
    def cluster(string):
        return 0

    @staticmethod
    def is_alpha(string):
        return orth.is_alpha(string)

    @staticmethod
    def is_ascii(string):
        return orth.is_ascii(string)

    @staticmethod
    def is_digit(string):
        return string.isdigit()

    @staticmethod
    def is_lower(string):
        return orth.is_lower(string)

    @staticmethod
    def is_punct(string):
        return orth.is_punct(string)

    @staticmethod
    def is_space(string):
        return string.isspace()

    @staticmethod
    def is_title(string):
        return orth.is_title(string)

    @staticmethod
    def is_upper(string):
        return orth.is_upper(string)

    @staticmethod
    def like_url(string):
        return orth.like_url(string)

    @staticmethod
    def like_number(string):
        return orth.like_number(string)

    @staticmethod
    def like_email(string):
        return orth.like_email(string)

    @classmethod
    def default_lex_attrs(cls, data_dir=None):
        """Return a dict mapping attribute ids to getter functions.

        ``data_dir`` is accepted but not used here.
        """
        return {
            attrs.LOWER: cls.lower,
            attrs.NORM: cls.norm,
            attrs.SHAPE: cls.shape,
            attrs.PREFIX: cls.prefix,
            attrs.SUFFIX: cls.suffix,
            attrs.CLUSTER: cls.cluster,
            # NOTE(review): this default (-10.0) differs from cls.prob (-30);
            # confirm which value is intended.
            attrs.PROB: lambda string: -10.0,
            attrs.IS_ALPHA: cls.is_alpha,
            attrs.IS_ASCII: cls.is_ascii,
            attrs.IS_DIGIT: cls.is_digit,
            attrs.IS_LOWER: cls.is_lower,
            attrs.IS_PUNCT: cls.is_punct,
            attrs.IS_SPACE: cls.is_space,
            attrs.IS_TITLE: cls.is_title,
            attrs.IS_UPPER: cls.is_upper,
            attrs.LIKE_URL: cls.like_url,
            attrs.LIKE_NUM: cls.like_number,
            attrs.LIKE_EMAIL: cls.like_email,
            attrs.IS_STOP: lambda string: False,
            attrs.IS_OOV: lambda string: True
        }

    @classmethod
    def default_dep_labels(cls):
        return {0: {'ROOT': True}}

    @classmethod
    def default_ner_labels(cls):
        return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}

    @classmethod
    def default_data_dir(cls):
        """Return the ``data`` directory next to this module."""
        return path.join(path.dirname(__file__), 'data')

    @classmethod
    def default_vectors(cls, data_dir):
        return None

    @classmethod
    def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
        """Load the vocab from ``<data_dir>/vocab``, filling in class defaults."""
        if data_dir is None:
            data_dir = cls.default_data_dir()
        if vectors is None:
            vectors = cls.default_vectors(data_dir)
        if get_lex_attr is None:
            get_lex_attr = cls.default_lex_attrs(data_dir)
        return Vocab.from_dir(
            path.join(data_dir, 'vocab'),
            get_lex_attr=get_lex_attr,
            vectors=vectors)

    @classmethod
    def default_tokenizer(cls, vocab, data_dir):
        """Load the tokenizer from ``data_dir``, or build an empty one if it is missing."""
        if path.exists(data_dir):
            return Tokenizer.from_dir(vocab, data_dir)
        else:
            return Tokenizer(vocab, {}, None, None, None)

    # The remaining component factories return None when their directory does
    # not exist, which switches that pipeline step off in __call__.
    @classmethod
    def default_tagger(cls, vocab, data_dir):
        if path.exists(data_dir):
            return Tagger.from_dir(data_dir, vocab)
        else:
            return None

    @classmethod
    def default_parser(cls, vocab, data_dir):
        if path.exists(data_dir):
            return Parser.from_dir(data_dir, vocab.strings, ArcEager)
        else:
            return None

    @classmethod
    def default_entity(cls, vocab, data_dir):
        if path.exists(data_dir):
            return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
        else:
            return None

    @classmethod
    def default_matcher(cls, vocab, data_dir):
        if path.exists(data_dir):
            return Matcher.from_dir(data_dir, vocab)
        else:
            return None

    def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                 parser=None, entity=None, matcher=None, serializer=None):
        """Assemble the pipeline, building every component left as None.

        Args:
            data_dir: Root data directory; defaults to ``default_data_dir()``.
            vocab, tokenizer, tagger, parser, entity, matcher: Ready-made
                components. Any left as None is built by the matching
                ``default_*`` classmethod.
            serializer: Accepted but not used by this constructor.
        """
        if data_dir is None:
            data_dir = self.default_data_dir()
        if vocab is None:
            vocab = self.default_vocab(data_dir)
        if tokenizer is None:
            tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
        if tagger is None:
            tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
        if entity is None:
            entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
        if parser is None:
            parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
        if matcher is None:
            matcher = self.default_matcher(vocab, data_dir=data_dir)
        # end_training() falls back to self.data_dir, so it must be stored.
        self.data_dir = data_dir
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser
        self.entity = entity
        self.matcher = matcher

    def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        Components that are None are skipped. The matcher and the entity
        recogniser both run only when ``entity`` is true. ``merge_mwes`` is
        accepted but not used here.

        Args:
            text (unicode): The text to be processed.
            tag (bool): Whether to run the tagger.
            parse (bool): Whether to run the parser.
            entity (bool): Whether to run the matcher and entity recogniser.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        if self.tagger and tag:
            self.tagger(tokens)
        if self.matcher and entity:
            self.matcher(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        return tokens

    def end_training(self, data_dir=None):
        """Finish training and write models, strings and frequencies to disk.

        Requires the parser, entity recogniser and tagger to all be present.

        Args:
            data_dir: Output root; defaults to the directory given at
                construction time.
        """
        if data_dir is None:
            data_dir = self.data_dir
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
            file_.write(
                json.dumps([
                    (TAG, list(self.tagger.freqs[TAG].items())),
                    (DEP, list(self.parser.moves.freqs[DEP].items())),
                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))

86
spacy/lemmatizer.py Normal file
View File

@ -0,0 +1,86 @@
from __future__ import unicode_literals
from os import path
import codecs
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import NOUN, VERB, ADJ
class Lemmatizer(object):
    """Rule- and lookup-based lemmatizer for nouns, verbs and adjectives.

    Holds, per part-of-speech name ('noun', 'verb', 'adj', ...), a set of
    known words (``index``), a dict of irregular forms (``exc``) and a list of
    ``(old_suffix, new_suffix)`` rewrite rules (``rules``).
    """

    @classmethod
    def from_dir(cls, data_dir):
        """Load the index, exception and rule tables from ``data_dir``.

        Reads ``index.<pos>`` and ``<pos>.exc`` for adj, adv, noun and verb,
        plus ``lemma_rules.json``.
        """
        index = {}
        exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
        # Close the rules file explicitly instead of leaking the handle.
        with open(path.join(data_dir, 'lemma_rules.json')) as file_:
            rules = json.load(file_)
        return cls(index, exc, rules)

    def __init__(self, index, exceptions, rules):
        self.index = index
        self.exc = exceptions
        self.rules = rules

    def __call__(self, string, pos):
        """Return one lemma for ``string`` given its part of speech.

        Args:
            string: The word to lemmatize.
            pos: NOUN, VERB or ADJ, or the name 'noun', 'verb' or 'adj'.

        Returns:
            The smallest candidate lemma in sort order, or ``string``
            unchanged for any other part of speech.
        """
        if pos == NOUN:
            pos = 'noun'
        elif pos == VERB:
            pos = 'verb'
        elif pos == ADJ:
            pos = 'adj'
        elif pos not in ('noun', 'verb', 'adj'):
            return string
        # The noun/verb/adj helpers below pass the name directly; without the
        # membership test above they would return the input unchanged.
        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
        return min(lemmas)

    def noun(self, string):
        return self(string, 'noun')

    def verb(self, string):
        return self(string, 'verb')

    def adj(self, string):
        return self(string, 'adj')
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate base forms for ``string``.

    The lower-cased string is a candidate if it is in ``index``; so is every
    entry listed for it in ``exceptions``, and every suffix rewrite from
    ``rules`` (pairs of ``(old_suffix, new_suffix)``) whose result is in
    ``index``. If nothing qualifies, the lower-cased string is the only
    candidate.
    """
    lowered = string.lower()
    candidates = [lowered] if lowered in index else []
    candidates.extend(exceptions.get(lowered, []))
    for suffix, replacement in rules:
        if not lowered.endswith(suffix):
            continue
        stem = lowered[:len(lowered) - len(suffix)]
        rewritten = stem + replacement
        if rewritten in index:
            candidates.append(rewritten)
    if candidates:
        return set(candidates)
    return set([lowered])
def read_index(loc):
    """Read the set of single-token entries from an index file.

    Each data line's first whitespace-separated field is taken as the word.
    Lines that start with a space are skipped, as are blank lines, and words
    containing an underscore are left out.

    Args:
        loc: Path of the UTF-8 encoded index file.

    Returns:
        A set of unicode words.
    """
    index = set()
    # The context manager closes the handle as soon as reading is done.
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: there is no first field to read.
                continue
            word = pieces[0]
            if word.count('_') == 0:
                index.add(word)
    return index
def read_exc(loc):
    """Read an exceptions file into a dict of ``form -> tuple of base forms``.

    The first whitespace-separated field of each line is the key and the
    remaining fields become its value. Lines that start with a space and
    blank lines are skipped.

    Args:
        loc: Path of the UTF-8 encoded exceptions file.

    Returns:
        A dict mapping unicode strings to tuples of unicode strings.
    """
    exceptions = {}
    # The context manager closes the handle as soon as reading is done.
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            if not pieces:
                # Blank line: there is no key field to read.
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

View File

@ -17,6 +17,7 @@ cdef class Lexeme:
cdef readonly attr_t orth
@staticmethod
<<<<<<< HEAD
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
@ -41,11 +42,30 @@ cdef class Lexeme:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
=======
# Copy every lexeme property from a Python dict into the C struct. The string
# valued properties are interned through vocab.strings; the rest are stored
# as given. A missing key raises KeyError.
cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
lex.length = props['length']
lex.orth = vocab.strings[props['orth']]
lex.lower = vocab.strings[props['lower']]
lex.norm = vocab.strings[props['norm']]
lex.shape = vocab.strings[props['shape']]
lex.prefix = vocab.strings[props['prefix']]
lex.suffix = vocab.strings[props['suffix']]
lex.cluster = props['cluster']
lex.prob = props['prob']
lex.sentiment = props['sentiment']
lex.flags = props['flags']
>>>>>>> de
@staticmethod
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return Lexeme.check_flag(lex, feat_name)
if Lexeme.check_flag(lex, feat_name):
return 1
else:
return 0
elif feat_name == ID:
return lex.id
elif feat_name == ORTH:
@ -67,8 +87,28 @@ cdef class Lexeme:
else:
return 0
# Set one attribute on the C struct by attribute id. Ids smaller than the bit
# width of flags_t are treated as boolean flags; the listed ids set the
# matching struct field. Any other id is silently ignored.
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
if name < (sizeof(flags_t) * 8):
Lexeme.set_flag(lex, name, value)
elif name == ID:
lex.id = value
elif name == LOWER:
lex.lower = value
elif name == NORM:
lex.norm = value
elif name == SHAPE:
lex.shape = value
elif name == PREFIX:
lex.prefix = value
elif name == SUFFIX:
lex.suffix = value
elif name == CLUSTER:
lex.cluster = value
@staticmethod
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
<<<<<<< HEAD
return lexeme.flags & (1 << flag_id)
@staticmethod
@ -78,3 +118,17 @@ cdef class Lexeme:
lexeme.flags |= one << flag_id
else:
lexeme.flags &= ~(one << flag_id)
=======
if lexeme.flags & (1 << flag_id):
return True
else:
return False
# Set (value true) or clear (value false) bit flag_id in lex.flags. The
# constant 1 is held in a flags_t variable so the shift is done at that width.
# NOTE(review): declared to return bint, but no branch returns a value.
@staticmethod
cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
cdef flags_t one = 1
if value:
lex.flags |= one << flag_id
else:
lex.flags &= ~(one << flag_id)
>>>>>>> de

View File

@ -27,6 +27,17 @@ cdef class Lexeme:
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
assert self.c.orth == orth
# Python-callable wrapper that switches the given flag on for this lexeme.
def py_set_flag(self, attr_id_t flag_id):
Lexeme.set_flag(self.c, flag_id, True)
def py_check_flag(self, attr_id_t flag_id):
    # Python-callable wrapper: report whether the given flag is set, as a bool.
    return bool(Lexeme.check_flag(self.c, flag_id))
# Read-only: the lexeme's orth id decoded back to a string via the vocab.
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property lower:
def __get__(self): return self.c.lower
@ -48,9 +59,13 @@ cdef class Lexeme:
def __get__(self): return self.c.suffix
def __set__(self, int x): self.c.suffix = x
# Read-only: the lexeme's orth id decoded back to a string via the vocab.
property orth_:
def __get__(self):
return self.vocab.strings[self.c.orth]
property cluster:
    # Cluster id of the lexeme. Reads and writes the struct's ``cluster``
    # field; it previously aliased ``suffix``, so setting the cluster
    # overwrote the suffix and the cluster could never be read back.
    def __get__(self): return self.c.cluster
    def __set__(self, int x): self.c.cluster = x
property prob:
    # Probability value of the lexeme. Reads and writes the struct's ``prob``
    # field; it previously aliased ``suffix``. The setter takes a float so
    # fractional values are not truncated to an integer.
    def __get__(self): return self.c.prob
    def __set__(self, float x): self.c.prob = x
property lower_:
def __get__(self): return self.vocab.strings[self.c.lower]
@ -72,6 +87,10 @@ cdef class Lexeme:
def __get__(self): return self.c.suffix
def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]
# The raw flags bit field of the lexeme, readable and writable as a whole.
property flags:
def __get__(self): return self.c.flags
def __set__(self, flags_t x): self.c.flags = x
# Boolean view of the IS_OOV bit in the lexeme's flags.
property is_oov:
def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)

View File

@ -8,6 +8,7 @@ from cymem.cymem cimport Pool
from libcpp.vector cimport vector
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab
@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
cdef int i
for i in range(pattern.length):
if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
print get_token_attr(token, pattern.spec[i].attr)
return False
return True
@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store):
attr = map_attr_name(attr)
if isinstance(value, basestring):
value = string_store[value]
if isinstance(value, bool):
value = int(value)
converted[-1].append((attr, value))
print "Converted", converted[-1]
return converted
@ -92,6 +98,32 @@ def map_attr_name(attr):
return SHAPE
elif attr == 'NORM':
return NORM
elif attr == 'FLAG13':
return FLAG13
elif attr == 'FLAG14':
return FLAG14
elif attr == 'FLAG15':
return FLAG15
elif attr == 'FLAG16':
return FLAG16
elif attr == 'FLAG17':
return FLAG17
elif attr == 'FLAG18':
return FLAG18
elif attr == 'FLAG19':
return FLAG19
elif attr == 'FLAG20':
return FLAG20
elif attr == 'FLAG21':
return FLAG21
elif attr == 'FLAG22':
return FLAG22
elif attr == 'FLAG23':
return FLAG23
elif attr == 'FLAG24':
return FLAG24
elif attr == 'FLAG25':
return FLAG25
else:
raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@ -99,14 +131,28 @@ def map_attr_name(attr):
cdef class Matcher:
cdef Pool mem
cdef vector[Pattern*] patterns
cdef readonly int n_patterns
cdef readonly Vocab vocab
def __init__(self, vocab, patterns):
# patterns maps an entity key to a triple (entity type, attrs, specs).
# Entries are added in sorted key order.
self.vocab = vocab
self.mem = Pool()
self.vocab = vocab
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add(entity_key, etype, attrs, specs)
@classmethod
def from_dir(cls, data_dir, Vocab vocab):
    """Build a matcher from ``<data_dir>/vocab/gazetteer.json``.

    Returns a matcher with no patterns when the file does not exist.
    """
    patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
    if path.exists(patterns_loc):
        # Close the file as soon as it has been read.
        with open(patterns_loc) as file_:
            patterns_data = file_.read()
        patterns = json.loads(patterns_data)
        return cls(vocab, patterns)
    else:
        return cls(vocab, {})
# Read-only: the number of entries currently in the patterns vector.
property n_patterns:
def __get__(self): return self.patterns.size()
def add(self, entity_key, etype, attrs, specs):
if isinstance(entity_key, basestring):
entity_key = self.vocab.strings[entity_key]
@ -120,16 +166,6 @@ cdef class Matcher:
spec = _convert_strings(spec, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, spec, etype))
@classmethod
def from_dir(cls, vocab, data_dir):
    """Build a matcher from ``<data_dir>/vocab/gazetteer.json``.

    Returns a matcher with no patterns when the file does not exist.
    """
    patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
    if path.exists(patterns_loc):
        # Close the file as soon as it has been read.
        with open(patterns_loc) as file_:
            patterns_data = file_.read()
        patterns = json.loads(patterns_data)
        return cls(vocab, patterns)
    else:
        return cls(vocab, {})
def __call__(self, Doc doc):
cdef vector[Pattern*] partials
cdef int n_partials = 0
@ -139,11 +175,13 @@ cdef class Matcher:
cdef Pattern* state
matches = []
for token_i in range(doc.length):
print 'check', doc[token_i].orth_
token = &doc.data[token_i]
q = 0
for i in range(partials.size()):
state = partials.at(i)
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:
@ -153,6 +191,7 @@ cdef class Matcher:
for i in range(self.n_patterns):
state = self.patterns[i]
if match(state, token):
print 'match!'
if is_final(state):
matches.append(get_entity(state, token, token_i))
else:

View File

@ -1,4 +1,755 @@
from .structs cimport TokenC, Morphology, PosTag
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t
from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t
# Fill a Morphology struct from a dict of properties; -1 is the error value.
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
# One fine-grained tag: its morphology value, its index in the tag table,
# its coarse part-of-speech, and the string id of its name.
cdef struct RichTagC:
uint64_t morph
int id
univ_pos_t pos
attr_t name
# The analysis of one word under one tag: the tag itself plus the lemma id.
cdef struct MorphAnalysisC:
RichTagC tag
attr_t lemma
# Attribute and C-level method declarations for the Morphology class.
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
cdef public object lemmatizer
cdef public object n_tags
# Maps a tag-name string id to the tag's index in rich_tags.
cdef public object reverse_index
cdef public object tag_names
cdef RichTagC* rich_tags
# Looked up with (tag id, orth id) keys by assign_tag.
cdef PreshMapArray _cache
cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
#
#cpdef enum Feature_t:
# Abbr
# AdpType
# AdvType
# ConjType
# Connegative
# Derivation
# Echo
# Foreign
# Gender_dat
# Gender_erg
# Gender_psor
# Hyph
# InfForm
# NameType
# NounType
# NumberAbs
# NumberDat
# NumberErg
# NumberPsee
# NumberPsor
# NumForm
# NumValue
# PartForm
# PartType
# Person_abs
# Person_dat
# Person_psor
# Polite
# Polite_abs
# Polite_dat
# Prefix
# PrepCase
# PunctSide
# PunctType
# Style
# Typo
# Variant
# VerbType
#
#
#cpdef enum Animacy:
# Anim
# Inam
#
#
#cpdef enum Aspect:
# Freq
# Imp
# Mod
# None_
# Perf
#
#
#cpdef enum Case1:
# Nom
# Gen
# Acc
# Dat
# Voc
# Abl
#
#cdef enum Case2:
# Abe
# Abs
# Ade
# All
# Cau
# Com
# Del
# Dis
#
#cdef enum Case3:
# Ela
# Ess
# Ill
# Ine
# Ins
# Loc
# Lat
# Par
#
#cdef enum Case4:
# Sub
# Sup
# Tem
# Ter
# Tra
#
#
#cpdef enum Definite:
# Two
# Def
# Red
# Ind
#
#
#cpdef enum Degree:
# Cmp
# Comp
# None_
# Pos
# Sup
# Abs
# Com
# Degree # du
#
#
#cpdef enum Gender:
# Com
# Fem
# Masc
# Neut
#
#
#cpdef enum Mood:
# Cnd
# Imp
# Ind
# N
# Pot
# Sub
# Opt
#
#
#cpdef enum Negative:
# Neg
# Pos
# Yes
#
#
#cpdef enum Number:
# Com
# Dual
# None_
# Plur
# Sing
# Ptan # bg
# Count # bg
#
#
#cpdef enum NumType:
# Card
# Dist
# Frac
# Gen
# Mult
# None_
# Ord
# Sets
#
#
#cpdef enum Person:
# One
# Two
# Three
# None_
#
#
#cpdef enum Poss:
# Yes
#
#
#cpdef enum PronType1:
# AdvPart
# Art
# Default
# Dem
# Ind
# Int
# Neg
#
#cpdef enum PronType2:
# Prs
# Rcp
# Rel
# Tot
# Clit
# Exc # es, ca, it, fa
# Clit # it
#
#
#cpdef enum Reflex:
# Yes
#
#
#cpdef enum Tense:
# Fut
# Imp
# Past
# Pres
#
#cpdef enum VerbForm1:
# Fin
# Ger
# Inf
# None_
# Part
# PartFut
# PartPast
#
#cpdef enum VerbForm2:
# PartPres
# Sup
# Trans
# Gdv # la
#
#
#cpdef enum Voice:
# Act
# Cau
# Pass
# Mid # gkc
# Int # hb
#
#
#cpdef enum Abbr:
# Yes # cz, fi, sl, U
#
#cpdef enum AdpType:
# Prep # cz, U
# Post # U
# Voc # cz
# Comprep # cz
# Circ # U
# Voc # U
#
#
#cpdef enum AdvType1:
# # U
# Man
# Loc
# Tim
# Deg
# Cau
# Mod
# Sta
# Ex
#
#cpdef enum AdvType2:
# Adadj
#
#cpdef enum ConjType:
# Oper # cz, U
# Comp # cz, U
#
#cpdef enum Connegative:
# Yes # fi
#
#
#cpdef enum Derivation1:
# Minen # fi
# Sti # fi
# Inen # fi
# Lainen # fi
# Ja # fi
# Ton # fi
# Vs # fi
# Ttain # fi
#
#cpdef enum Derivation2:
# Ttaa
#
#
#cpdef enum Echo:
# Rdp # U
# Ech # U
#
#
#cpdef enum Foreign:
# Foreign # cz, fi, U
# Fscript # cz, fi, U
# Tscript # cz, U
# Yes # sl
#
#
#cpdef enum Gender_dat:
# Masc # bq, U
# Fem # bq, U
#
#
#cpdef enum Gender_erg:
# Masc # bq
# Fem # bq
#
#
#cpdef enum Gender_psor:
# Masc # cz, sl, U
# Fem # cz, sl, U
# Neut # sl
#
#
#cpdef enum Hyph:
# Yes # cz, U
#
#
#cpdef enum InfForm:
# One # fi
# Two # fi
# Three # fi
#
#
#cpdef enum NameType:
# Geo # U, cz
# Prs # U, cz
# Giv # U, cz
# Sur # U, cz
# Nat # U, cz
# Com # U, cz
# Pro # U, cz
# Oth # U, cz
#
#
#cpdef enum NounType:
# Com # U
# Prop # U
# Class # U
#
#cpdef enum Number_abs:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_dat:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_erg:
# Sing # bq, U
# Plur # bq, U
#
#cpdef enum Number_psee:
# Sing # U
# Plur # U
#
#
#cpdef enum Number_psor:
# Sing # cz, fi, sl, U
# Plur # cz, fi, sl, U
#
#
#cpdef enum NumForm:
# Digit # cz, sl, U
# Roman # cz, sl, U
# Word # cz, sl, U
#
#
#cpdef enum NumValue:
# One # cz, U
# Two # cz, U
# Three # cz, U
#
#
#cpdef enum PartForm:
# Pres # fi
# Past # fi
# Agt # fi
# Neg # fi
#
#
#cpdef enum PartType:
# Mod # U
# Emp # U
# Res # U
# Inf # U
# Vbp # U
#
#cpdef enum Person_abs:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_dat:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_erg:
# One # bq, U
# Two # bq, U
# Three # bq, U
#
#
#cpdef enum Person_psor:
# One # fi, U
# Two # fi, U
# Three # fi, U
#
#
#cpdef enum Polite:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_abs:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_erg:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Polite_dat:
# Inf # bq, U
# Pol # bq, U
#
#
#cpdef enum Prefix:
# Yes # U
#
#
#cpdef enum PrepCase:
# Npr # cz
# Pre # U
#
#
#cpdef enum PunctSide:
# Ini # U
# Fin # U
#
#cpdef enum PunctType1:
# Peri # U
# Qest # U
# Excl # U
# Quot # U
# Brck # U
# Comm # U
# Colo # U
# Semi # U
#
#cpdef enum PunctType2:
# Dash # U
#
#
#cpdef enum Style1:
# Arch # cz, fi, U
# Rare # cz, fi, U
# Poet # cz, U
# Norm # cz, U
# Coll # cz, U
# Vrnc # cz, U
# Sing # cz, U
# Expr # cz, U
#
#
#cpdef enum Style2:
# Derg # cz, U
# Vulg # cz, U
#
#
#cpdef enum Typo:
# Yes # fi, U
#
#
#cpdef enum Variant:
# Short # cz
# Bound # cz, sl
#
#
#cpdef enum VerbType:
# Aux # U
# Cop # U
# Mod # U
# Light # U
#
# Flat enumeration of morphological feature values. Members are named
# <Feature>_<Value>, following the per-feature draft enums commented out
# above; the trailing comments are copied from those drafts.
# NOTE(review): the trailing codes (cz, fi, sl, bq, U, ...) look like
# language or tag-set identifiers -- confirm their meaning.
cpdef enum Value_t:
Animacy_Anim
Animacy_Inam
Aspect_Freq
Aspect_Imp
Aspect_Mod
Aspect_None_
Aspect_Perf
Case_Abe
Case_Abl
Case_Abs
Case_Acc
Case_Ade
Case_All
Case_Cau
Case_Com
Case_Dat
Case_Del
Case_Dis
Case_Ela
Case_Ess
Case_Gen
Case_Ill
Case_Ine
Case_Ins
Case_Loc
Case_Lat
Case_Nom
Case_Par
Case_Sub
Case_Sup
Case_Tem
Case_Ter
Case_Tra
Case_Voc
Definite_Two
Definite_Def
Definite_Red
Definite_Ind
Degree_Cmp
Degree_Comp
Degree_None
Degree_Pos
Degree_Sup
Degree_Abs
Degree_Com
Degree_Dim # du
Gender_Com
Gender_Fem
Gender_Masc
Gender_Neut
Mood_Cnd
Mood_Imp
Mood_Ind
Mood_N
Mood_Pot
Mood_Sub
Mood_Opt
Negative_Neg
Negative_Pos
Negative_Yes
Number_Com
Number_Dual
Number_None
Number_Plur
Number_Sing
Number_Ptan # bg
Number_Count # bg
NumType_Card
NumType_Dist
NumType_Frac
NumType_Gen
NumType_Mult
NumType_None
NumType_Ord
NumType_Sets
Person_One
Person_Two
Person_Three
Person_None
Poss_Yes
PronType_AdvPart
PronType_Art
PronType_Default
PronType_Dem
PronType_Ind
PronType_Int
PronType_Neg
PronType_Prs
PronType_Rcp
PronType_Rel
PronType_Tot
PronType_Clit
PronType_Exc # es, ca, it, fa
Reflex_Yes
Tense_Fut
Tense_Imp
Tense_Past
Tense_Pres
VerbForm_Fin
VerbForm_Ger
VerbForm_Inf
VerbForm_None
VerbForm_Part
VerbForm_PartFut
VerbForm_PartPast
VerbForm_PartPres
VerbForm_Sup
VerbForm_Trans
VerbForm_Gdv # la
Voice_Act
Voice_Cau
Voice_Pass
Voice_Mid # gkc
Voice_Int # hb
Abbr_Yes # cz, fi, sl, U
AdpType_Prep # cz, U
AdpType_Post # U
AdpType_Voc # cz
AdpType_Comprep # cz
AdpType_Circ # U
AdvType_Man
AdvType_Loc
AdvType_Tim
AdvType_Deg
AdvType_Cau
AdvType_Mod
AdvType_Sta
AdvType_Ex
AdvType_Adadj
ConjType_Oper # cz, U
ConjType_Comp # cz, U
Connegative_Yes # fi
Derivation_Minen # fi
Derivation_Sti # fi
Derivation_Inen # fi
Derivation_Lainen # fi
Derivation_Ja # fi
Derivation_Ton # fi
Derivation_Vs # fi
Derivation_Ttain # fi
Derivation_Ttaa # fi
Echo_Rdp # U
Echo_Ech # U
Foreign_Foreign # cz, fi, U
Foreign_Fscript # cz, fi, U
Foreign_Tscript # cz, U
Foreign_Yes # sl
Gender_dat_Masc # bq, U
Gender_dat_Fem # bq, U
Gender_erg_Masc # bq
Gender_erg_Fem # bq
Gender_psor_Masc # cz, sl, U
Gender_psor_Fem # cz, sl, U
Gender_psor_Neut # sl
Hyph_Yes # cz, U
InfForm_One # fi
InfForm_Two # fi
InfForm_Three # fi
NameType_Geo # U, cz
NameType_Prs # U, cz
NameType_Giv # U, cz
NameType_Sur # U, cz
NameType_Nat # U, cz
NameType_Com # U, cz
NameType_Pro # U, cz
NameType_Oth # U, cz
NounType_Com # U
NounType_Prop # U
NounType_Class # U
Number_abs_Sing # bq, U
Number_abs_Plur # bq, U
Number_dat_Sing # bq, U
Number_dat_Plur # bq, U
Number_erg_Sing # bq, U
Number_erg_Plur # bq, U
Number_psee_Sing # U
Number_psee_Plur # U
Number_psor_Sing # cz, fi, sl, U
Number_psor_Plur # cz, fi, sl, U
NumForm_Digit # cz, sl, U
NumForm_Roman # cz, sl, U
NumForm_Word # cz, sl, U
NumValue_One # cz, U
NumValue_Two # cz, U
NumValue_Three # cz, U
PartForm_Pres # fi
PartForm_Past # fi
PartForm_Agt # fi
PartForm_Neg # fi
PartType_Mod # U
PartType_Emp # U
PartType_Res # U
PartType_Inf # U
PartType_Vbp # U
Person_abs_One # bq, U
Person_abs_Two # bq, U
Person_abs_Three # bq, U
Person_dat_One # bq, U
Person_dat_Two # bq, U
Person_dat_Three # bq, U
Person_erg_One # bq, U
Person_erg_Two # bq, U
Person_erg_Three # bq, U
Person_psor_One # fi, U
Person_psor_Two # fi, U
Person_psor_Three # fi, U
Polite_Inf # bq, U
Polite_Pol # bq, U
Polite_abs_Inf # bq, U
Polite_abs_Pol # bq, U
Polite_erg_Inf # bq, U
Polite_erg_Pol # bq, U
Polite_dat_Inf # bq, U
Polite_dat_Pol # bq, U
Prefix_Yes # U
PrepCase_Npr # cz
PrepCase_Pre # U
PunctSide_Ini # U
PunctSide_Fin # U
PunctType_Peri # U
PunctType_Qest # U
PunctType_Excl # U
PunctType_Quot # U
PunctType_Brck # U
PunctType_Comm # U
PunctType_Colo # U
PunctType_Semi # U
PunctType_Dash # U
Style_Arch # cz, fi, U
Style_Rare # cz, fi, U
Style_Poet # cz, U
Style_Norm # cz, U
Style_Coll # cz, U
Style_Vrnc # cz, U
Style_Sing # cz, U
Style_Expr # cz, U
Style_Derg # cz, U
Style_Vulg # cz, U
# NOTE(review): the draft enums above list "Typo: Yes" and "Variant: Short /
# Bound" at this position, so the next three names may be renaming slips for
# Typo_Yes, Variant_Short and Variant_Bound -- confirm before relying on them.
Style_Yes # fi, U
StyleVariant_StyleShort # cz
StyleVariant_StyleBound # cz, sl
VerbType_Aux # U
VerbType_Cop # U
VerbType_Mod # U
VerbType_Light # U

View File

@ -1,11 +1,89 @@
# cython: embedsignature=True
from os import path
from .lemmatizer import Lemmatizer
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN
# Copy the morphological fields found in `props` onto the C struct `morph`.
# Each field is looked up by its own name in `props`; a missing key leaves the
# field at 0. Declared `except -1` so a Python exception raised while reading
# `props` propagates to the caller.
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)
morph.tenspect = props.get('tenspect', 0)
morph.mood = props.get('mood', 0)
morph.gender = props.get('gender', 0)
morph.person = props.get('person', 0)
morph.case = props.get('case', 0)
morph.misc = props.get('misc', 0)
cdef class Morphology:
    """Assign tag, coarse POS, lemma and morphology to tokens.

    One RichTagC entry is built per key of `tag_map` (in sorted key order), and
    the (tag id, orth) -> MorphAnalysisC analyses are memoised in `_cache` so
    that each word/tag pair is only lemmatized once.
    """
    def __init__(self, StringStore string_store, tag_map, lemmatizer):
        """Build the tag table.

        Args:
            string_store (StringStore): Used to intern tag names and lemmas.
            tag_map (dict or None): Maps tag strings to property dicts. None is
                treated as an empty map.
            lemmatizer: Callable taking (string, pos) and returning a set of
                lemma strings, or None to disable lemmatization.
        """
        # Tolerate a missing tag map instead of failing on len(None).
        if tag_map is None:
            tag_map = {}
        self.mem = Pool()
        self.strings = string_store
        self.lemmatizer = lemmatizer
        # One slot more than the number of tags is allocated; the loop below
        # only fills the first len(tag_map) entries.
        self.n_tags = len(tag_map) + 1
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.reverse_index = {}

        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings[tag_str]
            self.rich_tags[i].morph = 0
            # NOTE(review): `props` is not consulted and `.pos` is never
            # assigned here, so lemmatize() is always called with whatever
            # value the allocator left in `.pos` -- confirm whether the coarse
            # POS should be derived from the tag map entry.
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)

    cdef int assign_tag(self, TokenC* token, tag) except -1:
        """Set lemma, pos, tag and morph on `token` for the given tag.

        Args:
            token (TokenC*): The token to update in place.
            tag: Either a tag string (resolved through `reverse_index`) or an
                integer index into `rich_tags`.

        Raises:
            KeyError: If `tag` is a string that is not in the tag map.
        """
        cdef int tag_id
        if isinstance(tag, basestring):
            try:
                tag_id = self.reverse_index[self.strings[tag]]
            except KeyError:
                # Report the offending tag in the exception itself rather than
                # printing it to stdout.
                raise KeyError("Unknown tag: %r" % (tag,))
        else:
            tag_id = tag
        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL:
            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            analysis.tag = self.rich_tags[tag_id]
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
            # Store the analysis so the next lookup for this (tag, orth) pair
            # hits the cache instead of allocating and lemmatizing again.
            self._cache.set(tag_id, token.lex.orth, analysis)
        token.lemma = analysis.lemma
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph

    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
        """Placeholder: setting individual morphological features is not
        implemented yet, so this is a no-op."""
        pass

    def load_morph_exceptions(self, dict exc):
        """Pre-populate the analysis cache with hand-specified exceptions.

        Args:
            exc (dict): Maps tag string -> {form string -> props dict}. In each
                props dict the key 'L' gives the lemma; every other key/value
                is passed to assign_feature().

        Raises:
            KeyError: If a tag string is not present in the tag map.
        """
        # Map (form, pos) to (lemma, rich tag)
        cdef unicode form_str
        cdef dict entries
        cdef dict props
        cdef attr_t orth
        for tag_str, entries in exc.items():
            tag = self.strings[tag_str]
            rich_tag = self.rich_tags[self.reverse_index[tag]]
            for form_str, props in entries.items():
                cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
                # Start from the tag's own entry so that assign_tag() copies a
                # populated tag (name, pos, morph) from the cached analysis.
                cached.tag = rich_tag
                orth = self.strings[form_str]
                for name_str, value_str in props.items():
                    if name_str == 'L':
                        cached.lemma = self.strings[value_str]
                    else:
                        self.assign_feature(&cached.tag.morph, name_str, value_str)
                if cached.lemma == 0:
                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
                # Key by tag id: that is the key assign_tag() looks up with.
                self._cache.set(rich_tag.id, orth, <void*>cached)

    def lemmatize(self, const univ_pos_t pos, attr_t orth):
        """Return the lemma ID for `orth` under coarse part-of-speech `pos`.

        The orth ID is returned unchanged when there is no lemmatizer, when
        `pos` is not NOUN, VERB or ADJ, or when the lemmatizer proposes no
        candidates. Otherwise the alphabetically first candidate is interned
        and its ID returned.
        """
        if self.lemmatizer is None:
            return orth
        cdef unicode py_string = self.strings[orth]
        if pos != NOUN and pos != VERB and pos != ADJ:
            return orth
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, pos)
        if not lemma_strings:
            # Nothing proposed: fall back to the surface form rather than
            # raising IndexError on the empty candidate list.
            return orth
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings[lemma_string]
        return lemma

View File

@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
cpdef bint like_url(unicode string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything they say http:// is going to be good.
if string.startswith('http://'):
if string.startswith('http://') or string.startswith('https://'):
return True
elif string.startswith('www.') and len(string) >= 5:
return True
@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
return False
# TODO: This should live in the language.orth
NUM_WORDS = set('zero one two three four five six seven eight nine ten'
'eleven twelve thirteen fourteen fifteen sixteen seventeen'
'eighteen nineteen twenty thirty forty fifty sixty seventy'

View File

@ -2,17 +2,22 @@
cpdef enum univ_pos_t:
NO_TAG
ADJ
ADV
ADP
ADV
AUX
CONJ
DET
INTJ
NOUN
NUM
PART
PRON
PRT
PROPN
PUNCT
SCONJ
SYM
VERB
X
PUNCT
EOL
SPACE
N_UNIV_TAGS

View File

@ -4,17 +4,22 @@ from __future__ import unicode_literals
UNIV_POS_NAMES = {
"NO_TAG": NO_TAG,
"ADJ": ADJ,
"ADV": ADV,
"ADP": ADP,
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
"NUM": NUM,
"PART": PART,
"PRON": PRON,
"PRT": PRT,
"PROPN": PROPN,
"PUNCT": PUNCT,
"SCONJ": SCONJ,
"SYM": SYM,
"VERB": VERB,
"X": X,
"PUNCT": PUNCT,
"SPACE": SPACE,
"EOL": EOL
"EOL": EOL,
"SPACE": SPACE
}

View File

@ -142,6 +142,8 @@ cdef class StringStore:
def load(self, loc):
with codecs.open(loc, 'r', 'utf8') as file_:
strings = file_.read().split(SEPARATOR)
if strings == ['']:
return None
cdef unicode string
cdef bytes byte_string
for string in strings:

View File

@ -1,4 +1,4 @@
from libc.stdint cimport uint8_t, uint32_t, int32_t
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
@ -26,22 +26,6 @@ cdef struct LexemeC:
float l2_norm
cdef struct Morphology:
uint8_t number
uint8_t tenspect # Tense/aspect/voice
uint8_t mood
uint8_t gender
uint8_t person
uint8_t case
uint8_t misc
cdef struct PosTag:
Morphology morph
int id
univ_pos_t pos
cdef struct Entity:
int start
int end
@ -59,8 +43,8 @@ cdef struct Constituent:
cdef struct TokenC:
const LexemeC* lex
Morphology morph
const Constituent* ctnt
uint64_t morph
univ_pos_t pos
bint spacy
int tag

View File

@ -11,7 +11,6 @@ from .stateclass cimport StateClass
cdef class Parser:
cdef readonly object cfg
cdef readonly Model model
cdef readonly TransitionSystem moves

View File

@ -67,16 +67,22 @@ def ParserFactory(transition_system):
cdef class Parser:
def __init__(self, StringStore strings, model_dir, transition_system):
def __init__(self, StringStore strings, transition_system, model):
self.moves = transition_system
self.model = model
@classmethod
def from_dir(cls, model_dir, strings, transition_system):
if not os.path.exists(model_dir):
print >> sys.stderr, "Warning: No model found at", model_dir
elif not os.path.isdir(model_dir):
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
else:
self.cfg = Config.read(model_dir, 'config')
self.moves = transition_system(strings, self.cfg.labels)
templates = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, templates, model_dir)
cfg = Config.read(model_dir, 'config')
moves = transition_system(strings, cfg.labels)
templates = get_templates(cfg.features)
model = Model(moves.n_moves, templates, model_dir)
return cls(strings, moves, model)
def __call__(self, Doc tokens):
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)

12
spacy/tagger.pxd Normal file
View File

@ -0,0 +1,12 @@
from ._ml cimport Model
from .structs cimport TokenC
from .vocab cimport Vocab
# C-level declaration of the Tagger extension type: its attributes and the two
# cdef methods that other Cython modules can call.
cdef class Tagger:
# Readable from Python, not assignable.
cdef readonly Vocab vocab
cdef readonly Model model
# Readable and assignable from Python.
cdef public dict freqs
# Both methods take a token index `i` and the token array, return an int, and
# use -1 as the exception marker (`except -1`).
cdef int predict(self, int i, const TokenC* tokens) except -1
cdef int update(self, int i, const TokenC* tokens, int gold) except -1

220
spacy/tagger.pyx Normal file
View File

@ -0,0 +1,220 @@
import json
from os import path
from collections import defaultdict
from thinc.typedefs cimport atom_t, weight_t
from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport *
from ._ml cimport arg_max
# Indices into the tagger's feature context array.
# There are five token positions (P2, P1, W, N1, N2), each with eight slots in
# the same order: orth, cluster, shape, prefix, suffix, pos, lemma, flags.
# _fill_from_token() writes eight consecutive values starting at a position's
# `*_orth` slot, so the order of members within each group must not change.
# N_CONTEXT_FIELDS is the trailing member and is used as the array length.
cpdef enum:
P2_orth
P2_cluster
P2_shape
P2_prefix
P2_suffix
P2_pos
P2_lemma
P2_flags
P1_orth
P1_cluster
P1_shape
P1_prefix
P1_suffix
P1_pos
P1_lemma
P1_flags
W_orth
W_cluster
W_shape
W_prefix
W_suffix
W_pos
W_lemma
W_flags
N1_orth
N1_cluster
N1_shape
N1_prefix
N1_suffix
N1_pos
N1_lemma
N1_flags
N2_orth
N2_cluster
N2_shape
N2_prefix
N2_suffix
N2_pos
N2_lemma
N2_flags
N_CONTEXT_FIELDS
cdef class Tagger:
    """A part-of-speech tagger.

    Tokens are tagged one at a time, left to right: the model scores a context
    window of two tokens either side of the current token, the arg-max class
    is taken as the tag id, and the vocab's Morphology writes the tag (and the
    attributes derived from it) onto the token. The tag set comes from
    `vocab.morphology`.
    """
    @classmethod
    def read_config(cls, data_dir):
        """Load and return the JSON object stored at <data_dir>/pos/config.json."""
        with open(path.join(data_dir, 'pos', 'config.json')) as file_:
            return json.load(file_)

    @classmethod
    def default_templates(cls):
        """Return the built-in feature templates.

        Each template is a tuple of context-field indices that are combined
        into one feature.
        """
        return (
            (W_orth,),
            (P1_lemma, P1_pos),
            (P2_lemma, P2_pos),
            (N1_orth,),
            (N2_orth,),

            (W_suffix,),
            (W_prefix,),

            (P1_pos,),
            (P2_pos,),
            (P1_pos, P2_pos),
            (P1_pos, W_orth),
            (P1_suffix,),
            (N1_suffix,),

            (W_shape,),
            (W_cluster,),
            (N1_cluster,),
            (N2_cluster,),
            (P1_cluster,),
            (P2_cluster,),

            (W_flags,),
            (N1_flags,),
            (N2_flags,),
            (P1_flags,),
            (P2_flags,),
        )

    @classmethod
    def blank(cls, vocab, templates):
        """Create a tagger with a fresh model (no model location) for `vocab`."""
        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
        return cls(vocab, model)

    @classmethod
    def from_dir(cls, data_dir, vocab):
        """Create a tagger whose model is loaded from `data_dir`.

        If <data_dir>/templates.json exists its contents are used as the
        feature templates; otherwise default_templates() is used.
        """
        templates_loc = path.join(data_dir, 'templates.json')
        if path.exists(templates_loc):
            # json.load() reads from a file object (json.loads() only accepts
            # a string); the context manager closes the handle afterwards.
            with open(templates_loc) as file_:
                templates = json.load(file_)
            # JSON arrays decode to lists; use the same tuple-of-tuples shape
            # that default_templates() returns.
            templates = tuple(tuple(template) for template in templates)
        else:
            templates = cls.default_templates()
        model = Model(vocab.morphology.n_tags, templates, data_dir)
        return cls(vocab, model)

    def __init__(self, Vocab vocab, model):
        """Create a tagger from a vocab and an already constructed model."""
        self.vocab = vocab
        self.model = model

        # TODO: Move this to tag map
        # Every known tag, plus the ID 0, starts with a count of 1.
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1

    @property
    def tag_names(self):
        """The tag strings known to the vocab's morphology."""
        return self.vocab.morphology.tag_names

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Tokens whose `pos` is already non-zero are left untouched.

        Args:
            tokens (Doc): The tokens to be tagged.
        """
        cdef int i
        if tokens.length == 0:
            return 0

        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                guess = self.predict(i, tokens.data)
                self.vocab.morphology.assign_tag(&tokens.data[i], guess)

        tokens.is_tagged = True
        # Reset the Doc's per-token Python object list.
        tokens._py_tokens = [None] * tokens.length

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign the given tags to the Doc without consulting the model.

        Args:
            tokens (Doc): The tokens to be tagged.
            tag_strs: One tag per token, indexable by token position.
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def train(self, Doc tokens, object gold_tag_strs):
        """Update the model on one Doc and return the number of tokens for
        which no loss was incurred.

        Args:
            tokens (Doc): The tokens to train on.
            gold_tag_strs: One gold tag string per token; None marks a token
                with no annotation (it incurs no loss and counts as correct).

        Raises:
            ValueError: If any gold tag is not in `tag_names`; the exception
                argument is the list of offending tags.
        """
        assert len(tokens) == len(gold_tag_strs)
        cdef int i
        cdef int loss
        # Build the name -> id table once instead of scanning the tag tuple
        # for every token.
        tag_ids = {name: tag_id for tag_id, name in enumerate(self.tag_names)}
        unknown = [g for g in gold_tag_strs if g is not None and g not in tag_ids]
        if unknown:
            raise ValueError(unknown)
        golds = [tag_ids[g] if g is not None else -1 for g in gold_tag_strs]
        correct = 0
        for i in range(tokens.length):
            guess = self.update(i, tokens.data, golds[i])
            loss = golds[i] != -1 and guess != golds[i]

            # The predicted tag (not the gold one) is written to the token, so
            # later tokens see the model's own history.
            self.vocab.morphology.assign_tag(&tokens.data[i], guess)

            correct += loss == 0
            self.freqs[TAG][tokens.data[i].tag] += 1
        return correct

    cdef int predict(self, int i, const TokenC* tokens) except -1:
        """Return the highest-scoring class index for token `i`."""
        cdef atom_t[N_CONTEXT_FIELDS] context
        # Reads tokens[i-2] .. tokens[i+2]; assumes the array is padded at
        # both ends so these are valid for every i -- TODO confirm.
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        return arg_max(scores, self.model.n_classes)

    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
        """Score token `i`, update the model against `gold`, and return the
        class index that was guessed. A `gold` of -1 yields a loss of 0."""
        cdef atom_t[N_CONTEXT_FIELDS] context
        # Same context window (and padding assumption) as predict().
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        guess = arg_max(scores, self.model.n_classes)
        loss = guess != gold if gold != -1 else 0
        self.model.update(context, guess, gold, loss)
        return guess
# Write the eight context values for token `t` into context[0..7].
# The caller passes a pointer to the position's first slot (e.g.
# &context[W_orth]), so the indices here are offsets within that group.
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
# Offset 0 is the `*_orth` slot, but it is filled with the lower-cased form.
context[0] = t.lex.lower
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
# Offset 5 is the `*_pos` slot, but it is filled with the token's tag.
context[5] = t.tag
context[6] = t.lemma
# Offset 7 collapses the lexeme's boolean flags into one category; the first
# matching test wins: 1 alpha, 2 punct, 3 URL-like, 4 number-like, 0 none.
if t.lex.flags & (1 << IS_ALPHA):
context[7] = 1
elif t.lex.flags & (1 << IS_PUNCT):
context[7] = 2
elif t.lex.flags & (1 << LIKE_URL):
context[7] = 3
elif t.lex.flags & (1 << LIKE_NUM):
context[7] = 4
else:
context[7] = 0

View File

@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology
from .structs cimport LexemeC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
from .vocab cimport Vocab, LexemesOrTokens, _Cached
cdef class Tokenizer:

View File

@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from .morphology cimport set_morph_from_dict
from .strings cimport hash_string
cimport cython
@ -29,7 +28,7 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules, self.vocab.pos_tags)
self._load_special_tokenization(rules)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
@ -193,9 +192,7 @@ cdef class Tokenizer:
tokens.push_back(prefixes[0][i], False)
if string:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
else:
if not cache_hit:
match = self.find_infix(string)
if match is None:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
@ -242,7 +239,7 @@ cdef class Tokenizer:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, object rules, object tag_map):
def _load_special_tokenization(self, special_cases):
'''Add a special-case tokenization rule.
'''
cdef int i
@ -253,29 +250,11 @@ cdef class Tokenizer:
cdef dict props
cdef LexemeC** lexemes
cdef hash_t hashed
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
if lemma is not None:
tokens[i].lemma = self.vocab.strings[lemma]
else:
tokens[i].lemma = 0
if 'pos' in props:
tokens[i].tag = self.vocab.strings[props['pos']]
tokens[i].pos = tag_map[props['pos']][0]
# These are defaults, which can be over-ridden by the
# token-specific props.
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
if tokens[i].lemma == 0:
tokens[i].lemma = tokens[i].lex.orth
set_morph_from_dict(&tokens[i].morph, props)
for chunk, substrings in sorted(special_cases.items()):
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
hashed = hash_string(chunk)
self._specials.set(hashed, cached)
self._cache.set(hashed, cached)
cached.data.tokens = self.vocab.make_fused_token(substrings)
key = hash_string(chunk)
self._specials.set(key, cached)
self._cache.set(key, cached)

View File

@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef const TokenC* const_TokenC_ptr
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
const_TokenC_ptr
cdef class Doc:

View File

@ -14,6 +14,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray
@ -210,7 +211,7 @@ cdef class Doc:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
if LexemeOrToken is TokenC_ptr:
if LexemeOrToken is const_TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
@ -218,6 +219,7 @@ cdef class Doc:
t.idx = 0
else:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
assert t.lex.orth != 0
t.spacy = has_space
self.length += 1
self._py_tokens.append(None)

View File

@ -1,7 +1,7 @@
from __future__ import unicode_literals
from collections import defaultdict
from ..structs cimport Morphology, TokenC, LexemeC
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t

View File

@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created

View File

@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology
cdef LexemeC EMPTY_LEXEME
@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const LexemeC* const* lexemes
TokenC* tokens
const TokenC* tokens
cdef struct _Cached:
@ -27,15 +28,18 @@ cdef class Vocab:
cpdef public lexeme_props_getter
cdef Pool mem
cpdef readonly StringStore strings
cdef readonly object pos_tags
cpdef readonly Morphology morphology
cdef readonly int length
cdef public object _serializer
cdef public object data_dir
cdef public float oov_prob
cdef public object get_lex_attr
cdef public object pos_tags
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL

View File

@ -17,10 +17,12 @@ from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
from .attrs cimport PROB
DEF MAX_VEC_SIZE = 100000
@ -35,30 +37,31 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
self.strings = StringStore()
#self.pos_tags = pos_tags if pos_tags is not None else {}
self.pos_tags = {}
self.get_lex_attr = get_lex_attr
self.repvec_length = 0
self.length = 0
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
if data_dir is not None:
self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
self.length = 1
self._serializer = None
@classmethod
def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.load_lexemes(path.join(data_dir, 'strings.txt'),
path.join(data_dir, 'lexemes.bin'))
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self._serializer = None
self.data_dir = data_dir
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
return self
property serializer:
def __get__(self):
@ -84,7 +87,9 @@ cdef class Vocab:
cdef LexemeC* lex
cdef hash_t key = hash_string(string)
lex = <LexemeC*>self._by_hash.get(key)
cdef size_t addr
if lex != NULL:
assert lex.orth == self.strings[string]
return lex
else:
return self._new_lexeme(mem, string)
@ -103,16 +108,29 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
cdef hash_t key
cdef bint is_oov = mem is not self.mem
mem = self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
for attr, func in self.lex_attr_getters.items():
Lexeme.set_struct_attr(lex, attr, func(string))
lex.orth = self.strings[string]
lex.length = len(string)
lex.id = self.length
if self.get_lex_attr is not None:
for attr, func in self.get_lex_attr.items():
value = func(string)
if isinstance(value, unicode):
value = self.strings[value]
if attr == PROB:
lex.prob = value
else:
Lexeme.set_struct_attr(lex, attr, value)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
return lex
@ -125,7 +143,7 @@ cdef class Vocab:
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
@ -142,22 +160,28 @@ cdef class Vocab:
An instance of the Lexeme Python class, with data copied on
instantiation.
'''
cdef const LexemeC* lexeme
cdef attr_t orth
if type(id_or_string) == int:
orth = id_or_string
lexeme = <LexemeC*>self._by_orth.get(orth)
if lexeme == NULL:
raise KeyError(id_or_string)
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
elif type(id_or_string) == unicode:
lexeme = self.get(self.mem, id_or_string)
assert lexeme.orth == self.strings[id_or_string]
if type(id_or_string) == unicode:
orth = self.strings[id_or_string]
else:
raise ValueError("Vocab unable to map type: "
"%s. Maps unicode --> Lexeme or "
"int --> Lexeme" % str(type(id_or_string)))
return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)
orth = id_or_string
return Lexeme(self, orth)
cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
token = &tokens[i]
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
token.lex = <LexemeC*>self.get(self.mem, props['F'])
if 'pos' in props:
self.morphology.assign_tag(token, props['pos'])
if 'L' in props:
tokens[i].lemma = self.strings[props['L']]
for feature, value in props.get('morph', {}).items():
self.morphology.assign_feature(&token.morph, feature, value)
return tokens
def dump(self, loc):
if path.exists(loc):

View File

@ -1,6 +1,7 @@
import pytest
@pytest.mark.models
def test_initial(EN):
doc = EN.tokenizer(u'I ate the pizza with anchovies.')
EN.tagger(doc)

View File

@ -41,25 +41,10 @@ def test_attribute():
def test_vocab_codec():
def get_lex_props(string, prob):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
vocab = Vocab()
vocab['dog'] = get_lex_props('dog', 0.001)
vocab['the'] = get_lex_props('the', 0.05)
vocab['jumped'] = get_lex_props('jumped', 0.005)
lex = vocab['dog']
lex = vocab['the']
lex = vocab['jumped']
codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab])

View File

@ -5,6 +5,7 @@ import re
import pytest
import numpy
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22, is_oov=False):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}
@pytest.fixture
def vocab():
vocab = Vocab(get_lex_props=get_lex_props)
vocab['dog'] = get_lex_props('dog', 0.001)
vocab = Vocab(Language.default_lex_attrs())
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
vocab['the'] = get_lex_props('the', 0.01)
vocab['quick'] = get_lex_props('quick', 0.005)
vocab['jumped'] = get_lex_props('jumped', 0.007)
lex = vocab['the']
lex = vocab['quick']
lex = vocab['jumped']
return vocab

View File

@ -14,6 +14,7 @@ def tagged(EN):
tokens = EN(string, tag=True, parse=False)
return tokens
@pytest.mark.models
def test_spaces(tagged):
assert tagged[0].pos != SPACE
assert tagged[0].pos_ != 'SPACE'

View File

@ -1,80 +1,81 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
import pytest
@pytest.mark.models
def test_1():
    """Documentation example: upper-case adverbs and read word log-probabilities."""
    import spacy.en
    from spacy.parts_of_speech import ADV
    # Load the pipeline, and call it with some text.
    nlp = spacy.en.English()
    tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
                 tag=True, parse=False)
    o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
    # Compare against the rebuilt string; asserting the bare literal is always
    # true and checks nothing.
    assert o == u"Give it BACK, he pleaded ABJECTLY, its mine."

    o = nlp.vocab[u'back'].prob
    assert o == -7.033305644989014
    o = nlp.vocab[u'not'].prob
    assert o == -5.332601070404053
    o = nlp.vocab[u'quietly'].prob
    assert o == -11.994928359985352
@pytest.mark.models
def test2():
    """Documentation example: upper-case only the adverbs that are rarer than
    the 1000th most frequent vocabulary entry."""
    import spacy.en
    from spacy.parts_of_speech import ADV
    nlp = spacy.en.English()
    # Find log probability of Nth most frequent word
    probs = [lex.prob for lex in nlp.vocab]
    probs.sort()
    is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
    tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
    o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
    # The comparison must be asserted; as a bare expression it was discarded.
    assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
# Documentation example: repeats the rare-adverb check, then ranks lower-case
# vocabulary entries that have a vector by cosine similarity to the vector of
# tokens[7] and compares slices of that ranking with fixed word lists.
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
pleaded = tokens[7]
assert pleaded.repvec.shape == (300,)
o = pleaded.repvec[:5]
assert sum(o) != 0
from numpy import dot
from numpy.linalg import norm
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
# Ascending sort followed by reverse(): most similar entries come first.
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
words.reverse()
o = [w.orth_ for w in words[0:20]]
assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
u'countersued', u'remonstrated', u'begged', u'apologised',
u'consented', u'acquiesced', u'petitioned', u'quarreled',
u'appealed', u'pleading']
o = [w.orth_ for w in words[50:60]]
assert o == [u'martialed', u'counselled', u'bragged',
u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
u'dissented', u'yearned']
o = [w.orth_ for w in words[100:110]]
assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
u'clerked']
#o = [w.orth_ for w in words[1000:1010]]
#assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#o = [w.orth_ for w in words[50000:50010]]
#assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# u'dirty', u'rims', u'artists']
#@pytest.mark.models
#def test_1():
# import spacy.en
# from spacy.parts_of_speech import ADV
# # Load the pipeline, and call it with some text.
# nlp = spacy.en.English()
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.",
# tag=True, parse=False)
# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
# assert u"Give it BACK, he pleaded ABJECTLY, its mine."
#
# o = nlp.vocab[u'back'].prob
# assert o == -7.033305644989014
# o = nlp.vocab[u'not'].prob
# assert o == -5.332601070404053
# o = nlp.vocab[u'quietly'].prob
# assert o == -11.994928359985352
#
#
#@pytest.mark.models
#def test2():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
#@pytest.mark.models
#def test3():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# assert o == u'Give it back, he pleaded ABJECTLY, its mine.'
#
# pleaded = tokens[7]
# assert pleaded.repvec.shape == (300,)
# o = pleaded.repvec[:5]
# assert sum(o) != 0
# from numpy import dot
# from numpy.linalg import norm
#
# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
# words.reverse()
# o = [w.orth_ for w in words[0:20]]
# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
# u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
# u'countersued', u'remonstrated', u'begged', u'apologised',
# u'consented', u'acquiesced', u'petitioned', u'quarreled',
# u'appealed', u'pleading']
# o = [w.orth_ for w in words[50:60]]
# assert o == [u'martialed', u'counselled', u'bragged',
# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
# u'dissented', u'yearned']
# o = [w.orth_ for w in words[100:110]]
# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
# u'clerked']
#
# #o = [w.orth_ for w in words[1000:1010]]
# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
# #o = [w.orth_ for w in words[50000:50010]]
# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# # u'dirty', u'rims', u'artists']