mirror of https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00

commit 86c888667f
    Merge in changes from de branch
bin/init_model.py

@@ -20,6 +20,7 @@ from __future__ import unicode_literals
from ast import literal_eval
import math
import gzip
+import json

import plac
from pathlib import Path

@@ -29,8 +30,6 @@ from shutil import copytree
import codecs
from collections import defaultdict

-from spacy.en import get_lex_props
-from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.strings import hash_string

@@ -38,6 +37,13 @@ from preshed.counter import PreshCounter

+from spacy.parts_of_speech import NOUN, VERB, ADJ

+import spacy.en
+import spacy.de
+import spacy.fi
+import spacy.it


def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():

@@ -139,7 +145,7 @@ def _read_senses(loc):
    return lexicon


-def setup_vocab(src_dir, dst_dir):
+def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
    if not dst_dir.exists():
        dst_dir.mkdir()

@@ -148,13 +154,13 @@ def setup_vocab(src_dir, dst_dir):
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print("Warning: Word vectors file not found")
-    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
+    vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    if not probs:
-        oov_prob = 0.0
+        oov_prob = -20
    else:
        oov_prob = min(probs.values())
    for word in clusters:

@@ -163,23 +169,32 @@ def setup_vocab(src_dir, dst_dir):

    lexicon = []
    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
-        entry = get_lex_props(word)
-        entry['prob'] = float(prob)
-        cluster = clusters.get(word, '0')
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
-        entry['cluster'] = int(cluster[::-1], 2)
-        vocab[word] = entry
+        if word in clusters:
+            lexeme.cluster = int(clusters[word][::-1], 2)
+        else:
+            lexeme.cluster = 0
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)


-def main(lang_data_dir, corpora_dir, model_dir):
+def main(lang_id, lang_data_dir, corpora_dir, model_dir):
+    languages = {
+        'en': spacy.en.English.default_lex_attrs(),
+        'de': spacy.de.Deutsch.default_lex_attrs(),
+        'fi': spacy.fi.Finnish.default_lex_attrs(),
+        'it': spacy.it.Italian.default_lex_attrs(),
+    }

    model_dir = Path(model_dir)
-    lang_data_dir = Path(lang_data_dir)
-    corpora_dir = Path(corpora_dir)
+    lang_data_dir = Path(lang_data_dir) / lang_id
+    corpora_dir = Path(corpora_dir) / lang_id

    assert corpora_dir.exists()
    assert lang_data_dir.exists()

@@ -187,13 +202,19 @@ def main(lang_data_dir, corpora_dir, model_dir):
    if not model_dir.exists():
        model_dir.mkdir()

+    tag_map = json.load((lang_data_dir / 'tag_map.json').open())
    setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
-    setup_vocab(corpora_dir, model_dir / 'vocab')
+    setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')

    if (lang_data_dir / 'gazetteer.json').exists():
        copyfile(str(lang_data_dir / 'gazetteer.json'),
                 str(model_dir / 'vocab' / 'gazetteer.json'))
-    if not (model_dir / 'wordnet').exists():

+    if (lang_data_dir / 'lemma_rules.json').exists():
+        copyfile(str(lang_data_dir / 'lemma_rules.json'),
+                 str(model_dir / 'vocab' / 'lemma_rules.json'))

+    if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists():
        copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
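An aside on the cluster decoding above (an illustration, not part of the diff): the Brown cluster files store each cluster as a bit-string such as '1011', and the code reverses it before parsing it as binary, so the first bits of the path land in the lowest bits of the integer and `& 15` recovers the first four bits, as the in-line comment says.

# Illustration only: why the bit-string is reversed before int().
cluster = '1011'                 # cluster path, most-significant bit first
as_int = int(cluster[::-1], 2)   # '1101' -> 13: path stored little-endian
print(as_int & 15)               # 13 -> the low 4 bits hold the path prefix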
bin/parser/train.py

@@ -14,7 +14,6 @@ import re

import spacy.util
from spacy.en import English
-from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir

from spacy.syntax.util import Config
from spacy.gold import read_json_file

@@ -22,6 +21,11 @@ from spacy.gold import GoldParse

from spacy.scorer import Scorer

+from spacy.syntax.arc_eager import ArcEager
+from spacy.syntax.ner import BiluoPushDown
+from spacy.tagger import Tagger
+from spacy.syntax.parser import Parser


def _corrupt(c, noise_level):
    if random.random() >= noise_level:

@@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    ner_model_dir = path.join(model_dir, 'ner')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)
    os.mkdir(ner_model_dir)

-    setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                 labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
+                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
-                 labels=Language.EntityTransitionSystem.get_labels(gold_tuples),
+                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

-    nlp = Language(data_dir=model_dir)
+    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
+    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
+    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
+    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()

@@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
-    nlp.end_training()
+    nlp.end_training(model_dir)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
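The net effect of the train() changes above, condensed as a reading aid (all names as they appear in the hunk): the transition systems are now passed in explicitly instead of being read off Language class attributes, and the pipeline is assembled by hand from a blank Language.

# Reading aid: the new training setup in one place.
nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)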
bin/tagger/train.py | 175 (new, executable)

@@ -0,0 +1,175 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import os
from os import path
import shutil
import codecs
import random

import plac
import re

import spacy.util
from spacy.en import English

from spacy.tagger import Tagger

from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse

from spacy.scorer import Scorer


def score_model(scorer, nlp, raw_text, annot_tuples):
    if raw_text is None:
        tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
    else:
        tokens = nlp.tokenizer(raw_text)
    nlp.tagger(tokens)
    gold = GoldParse(tokens, annot_tuples)
    scorer.score(tokens, gold)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.end_training(model_dir)


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=verbose)
    return scorer


def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None):
    nlp = Language(data_dir=model_dir)
    if beam_width is not None:
        nlp.parser.cfg.beam_width = beam_width
    gold_tuples = read_json_file(dev_loc)
    scorer = Scorer()
    out_file = codecs.open(out_loc, 'w', 'utf8')
    for raw_text, sents in gold_tuples:
        sents = _merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                nlp.entity(tokens)
                nlp.parser(tokens)
            else:
                tokens = nlp(raw_text, merge_mwes=False)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold, verbose=False)
            for t in tokens:
                out_file.write(
                    '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_)
                )
    return scorer


@plac.annotations(
    train_loc=("Location of training file or directory"),
    dev_loc=("Location of development file or directory"),
    model_dir=("Location of output model directory",),
    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
    corruption_level=("Amount of noise to add to training data", "option", "c", float),
    gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
    out_loc=("Out location", "option", "o", str),
    n_sents=("Number of training sentences", "option", "n", int),
    n_iter=("Number of training iterations", "option", "i", int),
    verbose=("Verbose error reporting", "flag", "v", bool),
    debug=("Debug mode", "flag", "d", bool),
)
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False):
    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(English, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose)
    #if out_loc:
    #    write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
    scorer = evaluate(English, list(read_json_file(dev_loc)),
                      model_dir, gold_preproc=gold_preproc, verbose=verbose)
    print('TOK', scorer.token_acc)
    print('POS', scorer.tags_acc)
    print('UAS', scorer.uas)
    print('LAS', scorer.las)

    print('NER P', scorer.ents_p)
    print('NER R', scorer.ents_r)
    print('NER F', scorer.ents_f)


if __name__ == '__main__':
    plac.call(main)
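Since main() is wrapped with plac, the new script should be invocable roughly as follows (paths are hypothetical placeholders; plac maps the annotated parameters to positionals and flags):

# Hypothetical invocations:
#   python bin/tagger/train.py /data/train.json /data/dev.json /models/tagger -i 10 -n 5000
#   python bin/tagger/train.py /data/train.json /data/dev.json /models/tagger -e   # evaluate only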
lang_data/de/infix.txt | 3 (new)

@@ -0,0 +1,3 @@
\.\.\.
(?<=[a-z])\.(?=[A-Z])
(?<=[a-zA-Z])-(?=[a-zA-z])

lang_data/de/lemma_rules.json | 0 (new, empty)

lang_data/de/morphs.json | 0 (new, empty)
lang_data/de/prefix.txt | 21 (new)

@@ -0,0 +1,21 @@
,
"
(
[
{
*
<
$
£
“
'
``
`
#
US$
C$
A$
a-
‘
....
...
lang_data/de/sample.txt | 3 (new)

@@ -0,0 +1,3 @@
Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern.

Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs.
lang_data/de/specials.json | 149 (new)

@@ -0,0 +1,149 @@
{
    "a.m.": [{"F": "a.m."}],
    "p.m.": [{"F": "p.m."}],

    "1a.m.": [{"F": "1"}, {"F": "a.m."}],
    "2a.m.": [{"F": "2"}, {"F": "a.m."}],
    "3a.m.": [{"F": "3"}, {"F": "a.m."}],
    "4a.m.": [{"F": "4"}, {"F": "a.m."}],
    "5a.m.": [{"F": "5"}, {"F": "a.m."}],
    "6a.m.": [{"F": "6"}, {"F": "a.m."}],
    "7a.m.": [{"F": "7"}, {"F": "a.m."}],
    "8a.m.": [{"F": "8"}, {"F": "a.m."}],
    "9a.m.": [{"F": "9"}, {"F": "a.m."}],
    "10a.m.": [{"F": "10"}, {"F": "a.m."}],
    "11a.m.": [{"F": "11"}, {"F": "a.m."}],
    "12a.m.": [{"F": "12"}, {"F": "a.m."}],
    "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
    "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
    "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
    "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
    "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
    "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
    "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
    "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
    "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
    "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
    "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
    "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],


    "1p.m.": [{"F": "1"}, {"F": "p.m."}],
    "2p.m.": [{"F": "2"}, {"F": "p.m."}],
    "3p.m.": [{"F": "3"}, {"F": "p.m."}],
    "4p.m.": [{"F": "4"}, {"F": "p.m."}],
    "5p.m.": [{"F": "5"}, {"F": "p.m."}],
    "6p.m.": [{"F": "6"}, {"F": "p.m."}],
    "7p.m.": [{"F": "7"}, {"F": "p.m."}],
    "8p.m.": [{"F": "8"}, {"F": "p.m."}],
    "9p.m.": [{"F": "9"}, {"F": "p.m."}],
    "10p.m.": [{"F": "10"}, {"F": "p.m."}],
    "11p.m.": [{"F": "11"}, {"F": "p.m."}],
    "12p.m.": [{"F": "12"}, {"F": "p.m."}],
    "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
    "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
    "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
    "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
    "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
    "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
    "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
    "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
    "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
    "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
    "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
    "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],

    "Jan.": [{"F": "Jan.", "L": "Januar"}],
    "Feb.": [{"F": "Feb.", "L": "Februar"}],
    "Mär.": [{"F": "Mär.", "L": "März"}],
    "Apr.": [{"F": "Apr.", "L": "April"}],
    "Mai.": [{"F": "Mai.", "L": "Mai"}],
    "Jun.": [{"F": "Jun.", "L": "Juni"}],
    "Jul.": [{"F": "Jul.", "L": "Juli"}],
    "Aug.": [{"F": "Aug.", "L": "August"}],
    "Sep.": [{"F": "Sep.", "L": "September"}],
    "Sept.": [{"F": "Sept.", "L": "September"}],
    "Okt.": [{"F": "Okt.", "L": "Oktober"}],
    "Nov.": [{"F": "Nov.", "L": "November"}],
    "Dez.": [{"F": "Dez.", "L": "Dezember"}],

    ":)": [{"F": ":)"}],
    "<3": [{"F": "<3"}],
    ";)": [{"F": ";)"}],
    "(:": [{"F": "(:"}],
    ":(": [{"F": ":("}],
    "-_-": [{"F": "-_-"}],
    "=)": [{"F": "=)"}],
    ":/": [{"F": ":/"}],
    ":>": [{"F": ":>"}],
    ";-)": [{"F": ";-)"}],
    ":Y": [{"F": ":Y"}],
    ":P": [{"F": ":P"}],
    ":-P": [{"F": ":-P"}],
    ":3": [{"F": ":3"}],
    "=3": [{"F": "=3"}],
    "xD": [{"F": "xD"}],
    "^_^": [{"F": "^_^"}],
    "=]": [{"F": "=]"}],
    "=D": [{"F": "=D"}],
    "<333": [{"F": "<333"}],
    ":))": [{"F": ":))"}],
    ":0": [{"F": ":0"}],
    "-__-": [{"F": "-__-"}],
    "xDD": [{"F": "xDD"}],
    "o_o": [{"F": "o_o"}],
    "o_O": [{"F": "o_O"}],
    "V_V": [{"F": "V_V"}],
    "=[[": [{"F": "=[["}],
    "<33": [{"F": "<33"}],
    ";p": [{"F": ";p"}],
    ";D": [{"F": ";D"}],
    ";-p": [{"F": ";-p"}],
    ";(": [{"F": ";("}],
    ":p": [{"F": ":p"}],
    ":]": [{"F": ":]"}],
    ":O": [{"F": ":O"}],
    ":-/": [{"F": ":-/"}],
    ":-)": [{"F": ":-)"}],
    ":(((": [{"F": ":((("}],
    ":((": [{"F": ":(("}],
    ":')": [{"F": ":')"}],
    "(^_^)": [{"F": "(^_^)"}],
    "(=": [{"F": "(="}],
    "o.O": [{"F": "o.O"}],
    "\")": [{"F": "\")"}],
    "a.": [{"F": "a."}],
    "b.": [{"F": "b."}],
    "c.": [{"F": "c."}],
    "d.": [{"F": "d."}],
    "e.": [{"F": "e."}],
    "f.": [{"F": "f."}],
    "g.": [{"F": "g."}],
    "h.": [{"F": "h."}],
    "i.": [{"F": "i."}],
    "j.": [{"F": "j."}],
    "k.": [{"F": "k."}],
    "l.": [{"F": "l."}],
    "m.": [{"F": "m."}],
    "n.": [{"F": "n."}],
    "o.": [{"F": "o."}],
    "p.": [{"F": "p."}],
    "q.": [{"F": "q."}],
    "s.": [{"F": "s."}],
    "t.": [{"F": "t."}],
    "u.": [{"F": "u."}],
    "v.": [{"F": "v."}],
    "w.": [{"F": "w."}],
    "x.": [{"F": "x."}],
    "y.": [{"F": "y."}],
    "z.": [{"F": "z."}],

    "z.b.": [{"F": "z.b."}],
    "e.h.": [{"F": "I.e."}],
    "o.ä.": [{"F": "I.E."}],
    "bzw.": [{"F": "bzw."}],
    "usw.": [{"F": "usw."}],
    "\n": [{"F": "\n", "pos": "SP"}],
    "\t": [{"F": "\t", "pos": "SP"}],
    " ": [{"F": " ", "pos": "SP"}]
}
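A sketch of how a specials entry reads, assuming the same semantics as the existing English special cases ("F" is the token form, "L" an optional lemma override, "pos" a tag override):

import json

specials = json.load(open('lang_data/de/specials.json'))
print(specials['1am'])
# [{'F': '1'}, {'F': 'am', 'L': 'a.m.'}]
# i.e. the string "1am" tokenizes as ["1", "am"], with "am" lemmatized to "a.m."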
lang_data/de/suffix.txt | 26 (new)

@@ -0,0 +1,26 @@
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
”
''
's
'S
’s
’S
’
\.\.
\.\.\.
\.\.\.\.
(?<=[a-z0-9)\]"'%\)])\.
(?<=[0-9])km
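How these entries are consumed is not shown in this diff; a minimal sketch of the assumed mechanics (one pattern per line, combined into a single end-anchored alternation, which is what lookbehind entries like (?<=[0-9])km suggest):

import re

# Assumed combination scheme; the real loader lives in spacy.util.
pieces = [line.strip() for line in open('lang_data/de/suffix.txt') if line.strip()]
suffix_re = re.compile('|'.join(piece + '$' for piece in pieces))

print(suffix_re.search('10km').group())   # 'km' is split off the number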
lang_data/de/tag_map.json | 56 (new)

@@ -0,0 +1,56 @@
{
    "$(": {"pos": "PUNCT", "PunctType": "Brck"},
    "$,": {"pos": "PUNCT", "PunctType": "Comm"},
    "$.": {"pos": "PUNCT", "PunctType": "Peri"},
    "ADJA": {"pos": "ADJ"},
    "ADJD": {"pos": "ADJ", "Variant": "Short"},
    "ADV": {"pos": "ADV"},
    "APPO": {"pos": "ADP", "AdpType": "Post"},
    "APPR": {"pos": "ADP", "AdpType": "Prep"},
    "APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
    "APZR": {"pos": "ADP", "AdpType": "Circ"},
    "ART": {"pos": "DET", "PronType": "Art"},
    "CARD": {"pos": "NUM", "NumType": "Card"},
    "FM": {"pos": "X", "Foreign": "Yes"},
    "ITJ": {"pos": "INTJ"},
    "KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
    "KON": {"pos": "CONJ"},
    "KOUI": {"pos": "SCONJ"},
    "KOUS": {"pos": "SCONJ"},
    "NE": {"pos": "PROPN"},
    "NN": {"pos": "NOUN"},
    "PAV": {"pos": "ADV", "PronType": "Dem"},
    "PDAT": {"pos": "DET", "PronType": "Dem"},
    "PDS": {"pos": "PRON", "PronType": "Dem"},
    "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
    "PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
    "PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
    "PPER": {"pos": "PRON", "PronType": "Prs"},
    "PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
    "PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
    "PRELAT": {"pos": "DET", "PronType": "Rel"},
    "PRELS": {"pos": "PRON", "PronType": "Rel"},
    "PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
    "PTKA": {"pos": "PART"},
    "PTKANT": {"pos": "PART", "PartType": "Res"},
    "PTKNEG": {"pos": "PART", "Negative": "Neg"},
    "PTKVZ": {"pos": "PART", "PartType": "Vbp"},
    "PTKZU": {"pos": "PART", "PartType": "Inf"},
    "PWAT": {"pos": "DET", "PronType": "Int"},
    "PWAV": {"pos": "ADV", "PronType": "Int"},
    "PWS": {"pos": "PRON", "PronType": "Int"},
    "TRUNC": {"pos": "X", "Hyph": "Yes"},
    "VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
    "VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
    "VAINF": {"pos": "AUX", "VerbForm": "Inf"},
    "VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
    "VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
    "VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
    "VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
    "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
    "VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
    "VVINF": {"pos": "VERB", "VerbForm": "Inf"},
    "VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
    "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
    "XY": {"pos": "X"}
}
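Each key is an STTS-style corpus tag, mapped to a universal part-of-speech plus morphological features. A quick way to inspect an entry:

import json

tag_map = json.load(open('lang_data/de/tag_map.json'))
print(tag_map['VVFIN'])
# {'pos': 'VERB', 'Mood': 'Ind', 'VerbForm': 'Fin'}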
lang_data/en/lemma_rules.json | 31 (new)

@@ -0,0 +1,31 @@
{
    "noun": [
        ["s", ""],
        ["ses", "s"],
        ["ves", "f"],
        ["xes", "x"],
        ["zes", "z"],
        ["ches", "ch"],
        ["shes", "sh"],
        ["men", "man"],
        ["ies", "y"]
    ],

    "verb": [
        ["s", ""],
        ["ies", "y"],
        ["es", "e"],
        ["es", ""],
        ["ed", "e"],
        ["ed", ""],
        ["ing", "e"],
        ["ing", ""]
    ],

    "adj": [
        ["er", ""],
        ["est", ""],
        ["er", "e"],
        ["est", "e"]
    ]
}
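Each rule is an [old_suffix, new_suffix] pair; the lemmatizer tries every applicable rule and keeps the candidates found in the WordNet index (see the lemmatize() helper in the deleted spacy/en/lemmatizer.py further down this diff). A standalone sketch of the candidate generation:

rules = [["s", ""], ["ies", "y"], ["es", "e"]]

def candidates(string, rules):
    for old, new in rules:
        if string.endswith(old):
            yield string[:len(string) - len(old)] + new

print(list(candidates('ponies', rules)))
# ['ponie', 'pony', 'ponie'] -- the index lookup then keeps only 'pony'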
lang_data/en/tag_map.json | 60 (new)

@@ -0,0 +1,60 @@
{
    ".": {"pos": "punct", "puncttype": "peri"},
    ",": {"pos": "punct", "puncttype": "comm"},
    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
    ":": {"pos": "punct"},
    "$": {"pos": "sym", "other": {"symtype": "currency"}},
    "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
    "AFX": {"pos": "adj", "hyph": "hyph"},
    "CC": {"pos": "conj", "conjtype": "coor"},
    "CD": {"pos": "num", "numtype": "card"},
    "DT": {"pos": "adj", "prontype": "prn"},
    "EX": {"pos": "adv", "advtype": "ex"},
    "FW": {"pos": "x", "foreign": "foreign"},
    "HYPH": {"pos": "punct", "puncttype": "dash"},
    "IN": {"pos": "adp"},
    "JJ": {"pos": "adj", "degree": "pos"},
    "JJR": {"pos": "adj", "degree": "comp"},
    "JJS": {"pos": "adj", "degree": "sup"},
    "LS": {"pos": "punct", "numtype": "ord"},
    "MD": {"pos": "verb", "verbtype": "mod"},
    "NIL": {"pos": "no_tag"},
    "NN": {"pos": "noun", "number": "sing"},
    "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
    "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
    "NNS": {"pos": "noun", "number": "plur"},
    "PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"},
    "POS": {"pos": "part", "poss": "poss"},
    "PRP": {"pos": "noun", "prontype": "prs"},
    "PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"},
    "RB": {"pos": "adv", "degree": "pos"},
    "RBR": {"pos": "adv", "degree": "comp"},
    "RBS": {"pos": "adv", "degree": "sup"},
    "RP": {"pos": "part"},
    "SYM": {"pos": "sym"},
    "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
    "UH": {"pos": "intJ"},
    "VB": {"pos": "verb", "verbform": "inf"},
    "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
    "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
    "VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"},
    "VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"},
    "VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3},
    "WDT": {"pos": "adj", "prontype": "int|rel"},
    "WP": {"pos": "noun", "prontype": "int|rel"},
    "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
    "WRB": {"pos": "adv", "prontype": "int|rel"},
    "SP": {"pos": "space"},
    "ADD": {"pos": "x"},
    "NFP": {"pos": "punct"},
    "GW": {"pos": "x"},
    "AFX": {"pos": "x"},
    "HYPH": {"pos": "punct"},
    "XX": {"pos": "x"},
    "BES": {"pos": "verb"},
    "HVS": {"pos": "verb"}
}
setup.py | 2

@@ -153,7 +153,7 @@ def main(modules, is_pypy):

MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
-             'spacy.morphology',
+             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer', 'spacy.en.attrs',
spacy/_ml.pyx

@@ -91,6 +91,8 @@ cdef class Model:
            count_feats(counts[guess], feats, n_feats, -cost)
        self._model.update(counts)

-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
        self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
spacy/attrs.pxd

@@ -84,3 +84,4 @@ cpdef enum attr_id_t:
    ENT_TYPE
    HEAD
    SPACY
+    PROB
spacy/en/__init__.py

@@ -1,181 +1,12 @@
-from __future__ import unicode_literals
+from __future__ import unicode_literals, print_function

from os import path
import re
import struct
import json

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..matcher import Matcher
from ..language import Language

from ..tokens import Doc
from ..multi_words import RegexMerger

from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes

from ..util import read_lang_data

from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB


def get_lex_props(string, oov_prob=-30, is_oov=False):
    return {
        'flags': get_flags(string, is_oov=is_oov),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        'prefix': string[0],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': oov_prob,
        'sentiment': 0
    }

if_model_present = -1
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')


class English(object):
    """The English NLP pipeline.

    Example:

        Load data from default directory:

            >>> nlp = English()
            >>> nlp = English(data_dir=u'')

        Load data from specified directory:

            >>> nlp = English(data_dir=u'path/to/data_directory')

        Disable (and avoid loading) parts of the processing pipeline:

            >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)

        Start with nothing loaded:

            >>> nlp = English(data_dir=None)
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self,
                 data_dir=LOCAL_DATA_DIR,
                 Tokenizer=Tokenizer.from_dir,
                 Tagger=EnPosTagger,
                 Parser=ParserFactory(ParserTransitionSystem),
                 Entity=ParserFactory(EntityTransitionSystem),
                 Matcher=Matcher.from_dir,
                 Packer=None,
                 load_vectors=True
                 ):
        self.data_dir = data_dir

        if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):
            oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read())
        else:
            oov_prob = None

        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                           get_lex_props=get_lex_props, load_vectors=load_vectors,
                           pos_tags=POS_TAGS,
                           oov_prob=oov_prob)
        if Tagger is True:
            Tagger = EnPosTagger
        if Parser is True:
            transition_system = self.ParserTransitionSystem
            Parser = lambda s, d: parser.Parser(s, d, transition_system)
        if Entity is True:
            transition_system = self.EntityTransitionSystem
            Entity = lambda s, d: parser.Parser(s, d, transition_system)

        self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))

        if Tagger and path.exists(path.join(data_dir, 'pos')):
            self.tagger = Tagger(self.vocab.strings, data_dir)
        else:
            self.tagger = None
        if Parser and path.exists(path.join(data_dir, 'deps')):
            self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
        else:
            self.parser = None
        if Entity and path.exists(path.join(data_dir, 'ner')):
            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
        else:
            self.entity = None
        if Matcher:
            self.matcher = Matcher(self.vocab, data_dir)
        else:
            self.matcher = None
        if Packer:
            self.packer = Packer(self.vocab, data_dir)
        else:
            self.packer = None
        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])

    def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        Args:
            text (unicode): The text to be processed.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        if self.tagger and tag:
            self.tagger(tokens)
        if self.matcher and entity:
            self.matcher(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        if merge_mwes and self.mwe_merger is not None:
            self.mwe_merger(tokens)
        return tokens

    def end_training(self, data_dir=None):
        if data_dir is None:
            data_dir = self.data_dir
        self.parser.model.end_training()
        self.entity.model.end_training()
        self.tagger.model.end_training()
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))

        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
            file_.write(
                json.dumps([
                    (TAG, list(self.tagger.freqs[TAG].items())),
                    (DEP, list(self.parser.moves.freqs[DEP].items())),
                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))

    @property
    def tags(self):
        """Deprecated. List of part-of-speech tag names."""
        return self.tagger.tag_names
+class English(Language):
+    @classmethod
+    def default_data_dir(cls):
+        return LOCAL_DATA_DIR
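The pipeline construction and end_training() logic deleted here reappear in the new spacy/language.py below; English keeps only its data directory. Going by the docstrings on both sides, the user-facing behaviour is meant to be unchanged:

from spacy.en import English

nlp = English()   # loads LOCAL_DATA_DIR via the shared Language.__init__
tokens = nlp('An example sentence. Another example sentence.')
print(tokens[0].orth_)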
spacy/en/lemmatizer.py (deleted)

@@ -1,105 +0,0 @@
from __future__ import unicode_literals
from os import path
import codecs


NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)


VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)


ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)


class Lemmatizer(object):
    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
        self.noun_id = noun_id
        self.verb_id = verb_id
        self.adj_id = adj_id
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def __call__(self, string, pos):
        if pos == self.noun_id:
            return self.noun(string)
        elif pos == self.verb_id:
            return self.verb(string)
        elif pos == self.adj_id:
            return self.adj(string)
        else:
            raise Exception("Cannot lemmatize with unknown pos: %s" % pos)

    def noun(self, string):
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)


def read_index(loc):
    index = set()
    for line in codecs.open(loc, 'r', 'utf8'):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in codecs.open(loc, 'r', 'utf8'):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
spacy/en/pos.pxd

@@ -1,26 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool

from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..parts_of_speech cimport univ_pos_t
from .lemmatizer import Lemmatizer
+from ..tagger cimport Tagger


cdef class EnPosTagger:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef readonly Model model
    cdef public object lemmatizer
    cdef PreshMapArray _morph_cache
    cdef public dict freqs

    cdef PosTag* tags
    cdef readonly object tag_names
    cdef readonly object tag_map
    cdef readonly int n_tags

    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
+cdef class EnPosTagger(Tagger):
+    pass
spacy/en/pos.pyx | 388

@@ -1,389 +1,11 @@
from os import path
import json
import os
import shutil

from libc.string cimport memset
from ..parts_of_speech cimport NOUN, VERB, ADJ

from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict

from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON

from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max

from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t

-from .lemmatizer import Lemmatizer
+from ..lemmatizer import Lemmatizer


cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD


cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME


cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS


POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
    "ADD": (X, {}),
    "NFP": (PUNCT, {}),
    "GW": (X, {}),
    "AFX": (X, {}),
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "SP": (SPACE, {})
}


POS_TEMPLATES = (
    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_orth,),
    (N2_orth,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_flags,),
    (N1_flags,),
    (N2_flags,),
    (P1_flags,),
    (P2_flags,),
)


cdef struct _CachedMorph:
    Morphology morph
    int lemma


def setup_model_dir(tag_names, tag_map, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)


-cdef class EnPosTagger:
+cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English"""
    def __init__(self, StringStore strings, data_dir):
        self.mem = Pool()
        model_dir = path.join(data_dir, 'pos')
        self.strings = strings
        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
        self.tag_names = sorted(cfg['tag_names'])
        assert self.tag_names
        self.n_tags = len(self.tag_names)
        self.tag_map = cfg['tag_map']
        cdef int n_tags = len(self.tag_names) + 1

        self.model = Model(n_tags, cfg['templates'], model_dir)
        self._morph_cache = PreshMapArray(n_tags)
        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
        for i, tag in enumerate(sorted(self.tag_names)):
            pos, props = self.tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            set_morph_from_dict(&self.tags[i].morph, props)
        if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
            self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
                                                      'morphs.json'))))
        self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.strings[tag]] = 1
        self.freqs[TAG][0] = 1

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Args:
            tokens (Doc): The tokens to be tagged.
        """
        if tokens.length == 0:
            return 0
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                fill_context(context, i, tokens.data)
                scores = self.model.score(context)
                guess = arg_max(scores, self.model.n_classes)
                tokens.data[i].tag = self.strings[self.tag_names[guess]]
                self.set_morph(i, &self.tags[guess], tokens.data)

        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def tag_from_strings(self, Doc tokens, object tag_strs):
        cdef int i
        for i in range(tokens.length):
            tokens.data[i].tag = self.strings[tag_strs[i]]
            self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
                           tokens.data)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def train(self, Doc tokens, object gold_tag_strs):
        cdef int i
        cdef int loss
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        golds = [self.tag_names.index(g) if g is not None else -1
                 for g in gold_tag_strs]
        correct = 0
        for i in range(tokens.length):
            fill_context(context, i, tokens.data)
            scores = self.model.score(context)
            guess = arg_max(scores, self.model.n_classes)
            loss = guess != golds[i] if golds[i] != -1 else 0
            self.model.update(context, guess, golds[i], loss)
            tokens.data[i].tag = self.strings[self.tag_names[guess]]
            self.set_morph(i, &self.tags[guess], tokens.data)
            correct += loss == 0
            self.freqs[TAG][tokens.data[i].tag] += 1
        return correct

    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
        tokens[i].pos = tag.pos
        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
        if cached is NULL:
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
        if self.lemmatizer is None:
            return lex.orth
        cdef unicode py_string = self.strings[lex.orth]
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.orth
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, pos)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings[lemma_string]
        return lemma

    def load_morph_exceptions(self, dict exc):
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef int lemma
        cdef attr_t orth
        cdef int pos
        for pos_str, entries in exc.items():
            pos = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                orth = self.strings[form_str]
                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._morph_cache.set(pos, orth, <void*>cached)


cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    _fill_from_token(&context[P2_orth], &tokens[i-2])
    _fill_from_token(&context[P1_orth], &tokens[i-1])
    _fill_from_token(&context[W_orth], &tokens[i])
    _fill_from_token(&context[N1_orth], &tokens[i+1])
    _fill_from_token(&context[N2_orth], &tokens[i+2])


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0
+    def make_lemmatizer(self, data_dir):
+        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
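The removed POS_TEMPLATES table appears to move into the generic spacy.tagger module (cf. Tagger.default_templates() in the training scripts and the new 'spacy.tagger' entry in setup.py). Each template names fields of a context window around the current token, and the extracted feature is the conjunction of their values. A toy illustration of the scheme, with made-up values:

# Toy stand-in: a dict plays the role of the real atom_t context array.
context = {'P1_pos': 'DT', 'W_orth': 'dog', 'W_suffix': 'dog'}
templates = (('W_orth',), ('P1_pos', 'W_orth'))
features = [tuple(context[name] for name in t) for t in templates]
print(features)   # [('dog',), ('DT', 'dog')]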
spacy/fi/__init__.py | 11 (new)

@@ -0,0 +1,11 @@
from __future__ import unicode_literals, print_function

from os import path

from ..language import Language


class Finnish(Language):
    @classmethod
    def default_data_dir(cls):
        return path.join(path.dirname(__file__), 'data')
252
spacy/language.py
Normal file
252
spacy/language.py
Normal file
|
@ -0,0 +1,252 @@
|
|||
from os import path
|
||||
|
||||
try:
|
||||
import ujson as json
|
||||
except ImportError:
|
||||
import json
|
||||
|
||||
from .tokenizer import Tokenizer
|
||||
from .vocab import Vocab
|
||||
from .syntax.parser import Parser
|
||||
from .tagger import Tagger
|
||||
from .matcher import Matcher
|
||||
from .serialize.packer import Packer
|
||||
from ._ml import Model
|
||||
from . import attrs
|
||||
from . import orth
|
||||
from .syntax.ner import BiluoPushDown
|
||||
from .syntax.arc_eager import ArcEager
|
||||
|
||||
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
||||
|
||||
|
||||
class Language(object):
|
||||
@staticmethod
|
||||
def lower(string):
|
||||
return string.lower()
|
||||
|
||||
@staticmethod
|
||||
def norm(string):
|
||||
return string
|
||||
|
||||
@staticmethod
|
||||
def shape(string):
|
||||
return orth.word_shape(string)
|
||||
|
||||
@staticmethod
|
||||
def prefix(string):
|
||||
return string[0]
|
||||
|
||||
@staticmethod
|
||||
def suffix(string):
|
||||
return string[-3:]
|
||||
|
||||
@staticmethod
|
||||
def prob(string):
|
||||
return -30
|
||||
|
||||
@staticmethod
|
||||
def cluster(string):
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def is_alpha(string):
|
||||
return orth.is_alpha(string)
|
||||
|
||||
@staticmethod
|
||||
def is_ascii(string):
|
||||
return orth.is_ascii(string)
|
||||
|
||||
@staticmethod
|
||||
def is_digit(string):
|
||||
return string.isdigit()
|
||||
|
||||
@staticmethod
|
||||
def is_lower(string):
|
||||
return orth.is_lower(string)
|
||||
|
||||
@staticmethod
|
||||
def is_punct(string):
|
||||
return orth.is_punct(string)
|
||||
|
||||
@staticmethod
|
||||
def is_space(string):
|
||||
return string.isspace()
|
||||
|
||||
@staticmethod
|
||||
def is_title(string):
|
||||
return orth.is_title(string)
|
||||
|
||||
@staticmethod
|
||||
def is_upper(string):
|
||||
return orth.is_upper(string)
|
||||
|
||||
@staticmethod
|
||||
def like_url(string):
|
||||
return orth.like_url(string)
|
||||
|
||||
@staticmethod
|
||||
def like_number(string):
|
||||
return orth.like_number(string)
|
||||
|
||||
@staticmethod
|
||||
def like_email(string):
|
||||
return orth.like_email(string)
|
||||
|
||||
@classmethod
|
||||
def default_lex_attrs(cls, data_dir=None):
|
||||
return {
|
||||
attrs.LOWER: cls.lower,
|
||||
attrs.NORM: cls.norm,
|
||||
attrs.SHAPE: cls.shape,
|
||||
attrs.PREFIX: cls.prefix,
|
||||
attrs.SUFFIX: cls.suffix,
|
||||
attrs.CLUSTER: cls.cluster,
|
||||
attrs.PROB: lambda string: -10.0,
|
||||
|
||||
attrs.IS_ALPHA: cls.is_alpha,
|
||||
attrs.IS_ASCII: cls.is_ascii,
|
||||
attrs.IS_DIGIT: cls.is_digit,
|
||||
attrs.IS_LOWER: cls.is_lower,
|
||||
attrs.IS_PUNCT: cls.is_punct,
|
||||
attrs.IS_SPACE: cls.is_space,
|
||||
attrs.IS_TITLE: cls.is_title,
|
||||
attrs.IS_UPPER: cls.is_upper,
|
||||
attrs.LIKE_URL: cls.like_url,
|
||||
attrs.LIKE_NUM: cls.like_number,
|
||||
attrs.LIKE_EMAIL: cls.like_email,
|
||||
attrs.IS_STOP: lambda string: False,
|
||||
attrs.IS_OOV: lambda string: True
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def default_dep_labels(cls):
|
||||
return {0: {'ROOT': True}}
|
||||
|
||||
@classmethod
|
||||
def default_ner_labels(cls):
|
||||
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
||||
|
||||
@classmethod
|
||||
def default_data_dir(cls):
|
||||
return path.join(path.dirname(__file__), 'data')
|
||||
|
||||
@classmethod
|
||||
def default_vectors(cls, data_dir):
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
|
||||
if data_dir is None:
|
||||
data_dir = cls.default_data_dir()
|
||||
if vectors is None:
|
||||
vectors = cls.default_vectors(data_dir)
|
||||
if get_lex_attr is None:
|
||||
get_lex_attr = cls.default_lex_attrs(data_dir)
|
||||
return Vocab.from_dir(
|
||||
path.join(data_dir, 'vocab'),
|
||||
get_lex_attr=get_lex_attr,
|
||||
vectors=vectors)
|
||||
|
||||
@classmethod
|
||||
def default_tokenizer(cls, vocab, data_dir):
|
||||
if path.exists(data_dir):
|
||||
return Tokenizer.from_dir(vocab, data_dir)
|
||||
else:
|
||||
return Tokenizer(vocab, {}, None, None, None)
|
||||
|
||||
@classmethod
|
||||
def default_tagger(cls, vocab, data_dir):
|
||||
if path.exists(data_dir):
|
||||
return Tagger.from_dir(data_dir, vocab)
|
||||
else:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def default_parser(cls, vocab, data_dir):
|
||||
if path.exists(data_dir):
|
||||
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
|
||||
else:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def default_entity(cls, vocab, data_dir):
|
||||
if path.exists(data_dir):
|
||||
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
||||
else:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def default_matcher(cls, vocab, data_dir):
|
||||
if path.exists(data_dir):
|
||||
return Matcher.from_dir(data_dir, vocab)
|
||||
else:
|
||||
return None
|
||||
|
||||
    def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                 parser=None, entity=None, matcher=None, serializer=None):
        if data_dir is None:
            data_dir = self.default_data_dir()
        if vocab is None:
            vocab = self.default_vocab(data_dir)
        if tokenizer is None:
            tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
        if tagger is None:
            tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
        if entity is None:
            entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
        if parser is None:
            parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
        if matcher is None:
            matcher = self.default_matcher(vocab, data_dir=data_dir)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser
        self.entity = entity
        self.matcher = matcher
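Because each component only falls back to its default when the caller passes None, any stage can be swapped out independently. A hedged sketch, where my_tagger stands in for any object with the tagger's call signature:

    # Sketch: override one component, let the rest resolve from data_dir.
    nlp = Language(tagger=my_tagger)
    doc = nlp(u'Some text to process.')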
    def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        Args:
            text (unicode): The text to be processed.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        if self.tagger and tag:
            self.tagger(tokens)
        if self.matcher and entity:
            self.matcher(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        return tokens
    def end_training(self, data_dir=None):
        if data_dir is None:
            data_dir = self.data_dir
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))

        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
            file_.write(
                json.dumps([
                    (TAG, list(self.tagger.freqs[TAG].items())),
                    (DEP, list(self.parser.moves.freqs[DEP].items())),
                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))
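For orientation, the serializer.json written above is a list of (attribute ID, [(value ID, frequency), ...]) pairs, one per annotation layer; schematically (IDs and counts omitted, not from the commit):

    # [[TAG,      [[tag_string_id, count], ...]],
    #  [DEP,      [[dep_string_id, count], ...]],
    #  [ENT_IOB,  [[iob_value, count], ...]],
    #  [ENT_TYPE, [[type_string_id, count], ...]],
    #  [HEAD,     [[head_offset, count], ...]]]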
spacy/lemmatizer.py (new file, 86 lines)
@@ -0,0 +1,86 @@
from __future__ import unicode_literals
from os import path
import codecs

try:
    import ujson as json
except ImportError:
    import json

from .parts_of_speech import NOUN, VERB, ADJ


class Lemmatizer(object):
    @classmethod
    def from_dir(cls, data_dir):
        index = {}
        exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
        return cls(index, exc, rules)

    def __init__(self, index, exceptions, rules):
        self.index = index
        self.exc = exceptions
        self.rules = rules

    def __call__(self, string, pos):
        if pos == NOUN:
            pos = 'noun'
        elif pos == VERB:
            pos = 'verb'
        elif pos == ADJ:
            pos = 'adj'
        else:
            return string
        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
        return min(lemmas)

    def noun(self, string):
        return self(string, NOUN)

    def verb(self, string):
        return self(string, VERB)

    def adj(self, string):
        return self(string, ADJ)


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)


def read_index(loc):
    index = set()
    for line in codecs.open(loc, 'r', 'utf8'):
        if line.startswith(' '):
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:
            index.add(word)
    return index


def read_exc(loc):
    exceptions = {}
    for line in codecs.open(loc, 'r', 'utf8'):
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
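As a quick illustration of the suffix-rule machinery above (toy data invented for the example; real data comes from the WordNet index/exception files and lemma_rules.json loaded in from_dir):

    # Toy inputs for lemmatize():
    index = set(['duck', 'run'])
    exceptions = {'ran': ('run',)}
    rules = [['s', ''], ['ing', '']]

    lemmatize('ducks', index, exceptions, rules)     # set(['duck']) via the 's' rule
    lemmatize('ran', index, exceptions, rules)       # set(['run']) via the exception
    lemmatize('quacking', index, exceptions, rules)  # set(['quacking']): no base form in index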
@@ -17,6 +17,7 @@ cdef class Lexeme:
    cdef readonly attr_t orth

    @staticmethod
<<<<<<< HEAD
    cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
        cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
        self.c = lex

@@ -41,11 +42,30 @@ cdef class Lexeme:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value
=======
    cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
        lex.length = props['length']
        lex.orth = vocab.strings[props['orth']]
        lex.lower = vocab.strings[props['lower']]
        lex.norm = vocab.strings[props['norm']]
        lex.shape = vocab.strings[props['shape']]
        lex.prefix = vocab.strings[props['prefix']]
        lex.suffix = vocab.strings[props['suffix']]

        lex.cluster = props['cluster']
        lex.prob = props['prob']
        lex.sentiment = props['sentiment']

        lex.flags = props['flags']
>>>>>>> de

    @staticmethod
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        if feat_name < (sizeof(flags_t) * 8):
            return Lexeme.check_flag(lex, feat_name)
            if Lexeme.check_flag(lex, feat_name):
                return 1
            else:
                return 0
        elif feat_name == ID:
            return lex.id
        elif feat_name == ORTH:

@@ -66,9 +86,29 @@ cdef class Lexeme:
            return lex.cluster
        else:
            return 0

    @staticmethod
    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
        if name < (sizeof(flags_t) * 8):
            Lexeme.set_flag(lex, name, value)
        elif name == ID:
            lex.id = value
        elif name == LOWER:
            lex.lower = value
        elif name == NORM:
            lex.norm = value
        elif name == SHAPE:
            lex.shape = value
        elif name == PREFIX:
            lex.prefix = value
        elif name == SUFFIX:
            lex.suffix = value
        elif name == CLUSTER:
            lex.cluster = value

    @staticmethod
    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
<<<<<<< HEAD
        return lexeme.flags & (1 << flag_id)

    @staticmethod

@@ -78,3 +118,17 @@ cdef class Lexeme:
            lexeme.flags |= one << flag_id
        else:
            lexeme.flags &= ~(one << flag_id)
=======
        if lexeme.flags & (1 << flag_id):
            return True
        else:
            return False

    @staticmethod
    cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
        cdef flags_t one = 1
        if value:
            lex.flags |= one << flag_id
        else:
            lex.flags &= ~(one << flag_id)
>>>>>>> de
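The flag accessors above pack every boolean lexeme attribute into a single flags_t bitfield, one bit per attribute ID. A plain-Python sketch of the same bit arithmetic (illustration only, not part of the commit):

    # Equivalent pure-Python versions of check_flag/set_flag.
    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    def set_flag(flags, flag_id, value):
        return flags | (1 << flag_id) if value else flags & ~(1 << flag_id)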
@@ -27,6 +27,17 @@ cdef class Lexeme:
        self.vocab = vocab
        self.orth = orth
        self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
        assert self.c.orth == orth

    def py_set_flag(self, attr_id_t flag_id):
        Lexeme.set_flag(self.c, flag_id, True)

    def py_check_flag(self, attr_id_t flag_id):
        return True if Lexeme.check_flag(self.c, flag_id) else False

    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]

    property lower:
        def __get__(self): return self.c.lower

@@ -48,9 +59,13 @@ cdef class Lexeme:
        def __get__(self): return self.c.suffix
        def __set__(self, int x): self.c.suffix = x

    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]

    property cluster:
        def __get__(self): return self.c.cluster
        def __set__(self, int x): self.c.cluster = x

    property prob:
        def __get__(self): return self.c.prob
        def __set__(self, float x): self.c.prob = x

    property lower_:
        def __get__(self): return self.vocab.strings[self.c.lower]

@@ -72,6 +87,10 @@ cdef class Lexeme:
        def __get__(self): return self.c.suffix
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

    property flags:
        def __get__(self): return self.c.flags
        def __set__(self, flags_t x): self.c.flags = x

    property is_oov:
        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
        def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x)
@@ -8,6 +8,7 @@ from cymem.cymem cimport Pool
from libcpp.vector cimport vector

from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

@@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
    cdef int i
    for i in range(pattern.length):
        if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
            print get_token_attr(token, pattern.spec[i].attr)
            return False
    return True


@@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store):
        attr = map_attr_name(attr)
        if isinstance(value, basestring):
            value = string_store[value]
        if isinstance(value, bool):
            value = int(value)
        converted[-1].append((attr, value))
    print "Converted", converted[-1]
    return converted
@@ -92,6 +98,32 @@ def map_attr_name(attr):
        return SHAPE
    elif attr == 'NORM':
        return NORM
    elif attr == 'FLAG13':
        return FLAG13
    elif attr == 'FLAG14':
        return FLAG14
    elif attr == 'FLAG15':
        return FLAG15
    elif attr == 'FLAG16':
        return FLAG16
    elif attr == 'FLAG17':
        return FLAG17
    elif attr == 'FLAG18':
        return FLAG18
    elif attr == 'FLAG19':
        return FLAG19
    elif attr == 'FLAG20':
        return FLAG20
    elif attr == 'FLAG21':
        return FLAG21
    elif attr == 'FLAG22':
        return FLAG22
    elif attr == 'FLAG23':
        return FLAG23
    elif attr == 'FLAG24':
        return FLAG24
    elif attr == 'FLAG25':
        return FLAG25
    else:
        raise Exception("TODO: Finish supporting attr mapping %s" % attr)
@@ -99,14 +131,28 @@ def map_attr_name(attr):
cdef class Matcher:
    cdef Pool mem
    cdef vector[Pattern*] patterns
    cdef readonly int n_patterns
    cdef readonly Vocab vocab

    def __init__(self, vocab, patterns):
        self.vocab = vocab
        self.mem = Pool()
        self.vocab = vocab
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            self.add(entity_key, etype, attrs, specs)

    @classmethod
    def from_dir(cls, data_dir, Vocab vocab):
        patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
        if path.exists(patterns_loc):
            patterns_data = open(patterns_loc).read()
            patterns = json.loads(patterns_data)
            return cls(vocab, patterns)
        else:
            return cls(vocab, {})

    property n_patterns:
        def __get__(self): return self.patterns.size()

    def add(self, entity_key, etype, attrs, specs):
        if isinstance(entity_key, basestring):
            entity_key = self.vocab.strings[entity_key]

@@ -120,16 +166,6 @@ cdef class Matcher:
            spec = _convert_strings(spec, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, spec, etype))

    @classmethod
    def from_dir(cls, vocab, data_dir):
        patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
        if path.exists(patterns_loc):
            patterns_data = open(patterns_loc).read()
            patterns = json.loads(patterns_data)
            return cls(vocab, patterns)
        else:
            return cls(vocab, {})

    def __call__(self, Doc doc):
        cdef vector[Pattern*] partials
        cdef int n_partials = 0

@@ -139,11 +175,13 @@ cdef class Matcher:
        cdef Pattern* state
        matches = []
        for token_i in range(doc.length):
            print 'check', doc[token_i].orth_
            token = &doc.data[token_i]
            q = 0
            for i in range(partials.size()):
                state = partials.at(i)
                if match(state, token):
                    print 'match!'
                    if is_final(state):
                        matches.append(get_entity(state, token, token_i))
                    else:

@@ -153,6 +191,7 @@ cdef class Matcher:
            for i in range(self.n_patterns):
                state = self.patterns[i]
                if match(state, token):
                    print 'match!'
                    if is_final(state):
                        matches.append(get_entity(state, token, token_i))
                    else:
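For context, the patterns dict consumed by Matcher.__init__ maps an entity key to an (etype, attrs, specs) triple, where each pattern is a list of per-token (attribute name, value) specs, as _convert_strings above suggests. A plausible minimal gazetteer entry (values invented):

    # Invented example: match 'Google Now' as a two-token PRODUCT entity.
    patterns = {
        u'GoogleNow': (u'PRODUCT', {}, [[[('ORTH', u'Google')], [('ORTH', u'Now')]]])
    }
    matcher = Matcher(vocab, patterns)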
@@ -1,4 +1,755 @@
from .structs cimport TokenC, Morphology, PosTag
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from libc.stdint cimport uint64_t

from .structs cimport TokenC
from .strings cimport StringStore
from .typedefs cimport attr_t
from .parts_of_speech cimport univ_pos_t


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

cdef struct RichTagC:
    uint64_t morph
    int id
    univ_pos_t pos
    attr_t name


cdef struct MorphAnalysisC:
    RichTagC tag
    attr_t lemma


cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
    cdef public object lemmatizer
    cdef public object n_tags
    cdef public object reverse_index
    cdef public object tag_names

    cdef RichTagC* rich_tags
    cdef PreshMapArray _cache

    cdef int assign_tag(self, TokenC* token, tag) except -1

    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
# Planned fine-grained morphological features, kept commented out until they
# are implemented (parenthesised language codes note where each value is attested):
#
# cpdef enum Feature_t: Abbr, AdpType, AdvType, ConjType, Connegative,
#     Derivation, Echo, Foreign, Gender_dat, Gender_erg, Gender_psor, Hyph,
#     InfForm, NameType, NounType, NumberAbs, NumberDat, NumberErg,
#     NumberPsee, NumberPsor, NumForm, NumValue, PartForm, PartType,
#     Person_abs, Person_dat, Person_psor, Polite, Polite_abs, Polite_dat,
#     Prefix, PrepCase, PunctSide, PunctType, Style, Typo, Variant, VerbType
#
# cpdef enum Animacy: Anim, Inam
# cpdef enum Aspect: Freq, Imp, Mod, None_, Perf
# cpdef enum Case1: Nom, Gen, Acc, Dat, Voc, Abl
# cdef enum Case2: Abe, Abs, Ade, All, Cau, Com, Del, Dis
# cdef enum Case3: Ela, Ess, Ill, Ine, Ins, Loc, Lat, Par
# cdef enum Case4: Sub, Sup, Tem, Ter, Tra
# cpdef enum Definite: Two, Def, Red, Ind
# cpdef enum Degree: Cmp, Comp, None_, Pos, Sup, Abs, Com, Degree (du)
# cpdef enum Gender: Com, Fem, Masc, Neut
# cpdef enum Mood: Cnd, Imp, Ind, N, Pot, Sub, Opt
# cpdef enum Negative: Neg, Pos, Yes
# cpdef enum Number: Com, Dual, None_, Plur, Sing, Ptan (bg), Count (bg)
# cpdef enum NumType: Card, Dist, Frac, Gen, Mult, None_, Ord, Sets
# cpdef enum Person: One, Two, Three, None_
# cpdef enum Poss: Yes
# cpdef enum PronType1: AdvPart, Art, Default, Dem, Ind, Int, Neg
# cpdef enum PronType2: Prs, Rcp, Rel, Tot, Clit, Exc (es, ca, it, fa), Clit (it)
# cpdef enum Reflex: Yes
# cpdef enum Tense: Fut, Imp, Past, Pres
# cpdef enum VerbForm1: Fin, Ger, Inf, None_, Part, PartFut, PartPast
# cpdef enum VerbForm2: PartPres, Sup, Trans, Gdv (la)
# cpdef enum Voice: Act, Cau, Pass, Mid (gkc), Int (hb)
# cpdef enum Abbr: Yes (cz, fi, sl, U)
# cpdef enum AdpType: Prep (cz, U), Post (U), Voc (cz), Comprep (cz), Circ (U), Voc (U)
# cpdef enum AdvType1 (U): Man, Loc, Tim, Deg, Cau, Mod, Sta, Ex
# cpdef enum AdvType2: Adadj
# cpdef enum ConjType: Oper (cz, U), Comp (cz, U)
# cpdef enum Connegative: Yes (fi)
# cpdef enum Derivation1 (fi): Minen, Sti, Inen, Lainen, Ja, Ton, Vs, Ttain
# cpdef enum Derivation2: Ttaa
# cpdef enum Echo: Rdp (U), Ech (U)
# cpdef enum Foreign: Foreign (cz, fi, U), Fscript (cz, fi, U), Tscript (cz, U), Yes (sl)
# cpdef enum Gender_dat: Masc (bq, U), Fem (bq, U)
# cpdef enum Gender_erg: Masc (bq), Fem (bq)
# cpdef enum Gender_psor: Masc (cz, sl, U), Fem (cz, sl, U), Neut (sl)
# cpdef enum Hyph: Yes (cz, U)
# cpdef enum InfForm (fi): One, Two, Three
# cpdef enum NameType (U, cz): Geo, Prs, Giv, Sur, Nat, Com, Pro, Oth
# cpdef enum NounType (U): Com, Prop, Class
# cpdef enum Number_abs (bq, U): Sing, Plur
# cpdef enum Number_dat (bq, U): Sing, Plur
# cpdef enum Number_erg (bq, U): Sing, Plur
# cpdef enum Number_psee (U): Sing, Plur
# cpdef enum Number_psor (cz, fi, sl, U): Sing, Plur
# cpdef enum NumForm (cz, sl, U): Digit, Roman, Word
# cpdef enum NumValue (cz, U): One, Two, Three
# cpdef enum PartForm (fi): Pres, Past, Agt, Neg
# cpdef enum PartType (U): Mod, Emp, Res, Inf, Vbp
# cpdef enum Person_abs (bq, U): One, Two, Three
# cpdef enum Person_dat (bq, U): One, Two, Three
# cpdef enum Person_erg (bq, U): One, Two, Three
# cpdef enum Person_psor (fi, U): One, Two, Three
# cpdef enum Polite (bq, U): Inf, Pol
# cpdef enum Polite_abs (bq, U): Inf, Pol
# cpdef enum Polite_erg (bq, U): Inf, Pol
# cpdef enum Polite_dat (bq, U): Inf, Pol
# cpdef enum Prefix: Yes (U)
# cpdef enum PrepCase: Npr (cz), Pre (U)
# cpdef enum PunctSide (U): Ini, Fin
# cpdef enum PunctType1 (U): Peri, Qest, Excl, Quot, Brck, Comm, Colo, Semi
# cpdef enum PunctType2 (U): Dash
# cpdef enum Style1: Arch (cz, fi, U), Rare (cz, fi, U), Poet (cz, U), Norm (cz, U),
#     Coll (cz, U), Vrnc (cz, U), Sing (cz, U), Expr (cz, U)
# cpdef enum Style2 (cz, U): Derg, Vulg
# cpdef enum Typo: Yes (fi, U)
# cpdef enum Variant: Short (cz), Bound (cz, sl)
# cpdef enum VerbType (U): Aux, Cop, Mod, Light
#
cpdef enum Value_t:
    Animacy_Anim, Animacy_Inam
    Aspect_Freq, Aspect_Imp, Aspect_Mod, Aspect_None_, Aspect_Perf
    Case_Abe, Case_Abl, Case_Abs, Case_Acc, Case_Ade, Case_All, Case_Cau
    Case_Com, Case_Dat, Case_Del, Case_Dis, Case_Ela, Case_Ess, Case_Gen
    Case_Ill, Case_Ine, Case_Ins, Case_Loc, Case_Lat, Case_Nom, Case_Par
    Case_Sub, Case_Sup, Case_Tem, Case_Ter, Case_Tra, Case_Voc
    Definite_Two, Definite_Def, Definite_Red, Definite_Ind
    Degree_Cmp, Degree_Comp, Degree_None, Degree_Pos, Degree_Sup
    Degree_Abs, Degree_Com
    Degree_Dim  # du
    Gender_Com, Gender_Fem, Gender_Masc, Gender_Neut
    Mood_Cnd, Mood_Imp, Mood_Ind, Mood_N, Mood_Pot, Mood_Sub, Mood_Opt
    Negative_Neg, Negative_Pos, Negative_Yes
    Number_Com, Number_Dual, Number_None, Number_Plur, Number_Sing
    Number_Ptan, Number_Count  # bg
    NumType_Card, NumType_Dist, NumType_Frac, NumType_Gen, NumType_Mult
    NumType_None, NumType_Ord, NumType_Sets
    Person_One, Person_Two, Person_Three, Person_None
    Poss_Yes
    PronType_AdvPart, PronType_Art, PronType_Default, PronType_Dem
    PronType_Ind, PronType_Int, PronType_Neg, PronType_Prs, PronType_Rcp
    PronType_Rel, PronType_Tot, PronType_Clit
    PronType_Exc  # es, ca, it, fa
    Reflex_Yes
    Tense_Fut, Tense_Imp, Tense_Past, Tense_Pres
    VerbForm_Fin, VerbForm_Ger, VerbForm_Inf, VerbForm_None, VerbForm_Part
    VerbForm_PartFut, VerbForm_PartPast, VerbForm_PartPres, VerbForm_Sup
    VerbForm_Trans
    VerbForm_Gdv  # la
    Voice_Act, Voice_Cau, Voice_Pass
    Voice_Mid  # gkc
    Voice_Int  # hb
    Abbr_Yes  # cz, fi, sl, U
    AdpType_Prep  # cz, U
    AdpType_Post  # U
    AdpType_Voc  # cz
    AdpType_Comprep  # cz
    AdpType_Circ  # U
    AdvType_Man, AdvType_Loc, AdvType_Tim, AdvType_Deg, AdvType_Cau
    AdvType_Mod, AdvType_Sta, AdvType_Ex, AdvType_Adadj
    ConjType_Oper, ConjType_Comp  # cz, U
    Connegative_Yes  # fi
    Derivation_Minen, Derivation_Sti, Derivation_Inen, Derivation_Lainen  # fi
    Derivation_Ja, Derivation_Ton, Derivation_Vs, Derivation_Ttain  # fi
    Derivation_Ttaa  # fi
    Echo_Rdp, Echo_Ech  # U
    Foreign_Foreign, Foreign_Fscript  # cz, fi, U
    Foreign_Tscript  # cz, U
    Foreign_Yes  # sl
    Gender_dat_Masc, Gender_dat_Fem  # bq, U
    Gender_erg_Masc, Gender_erg_Fem  # bq
    Gender_psor_Masc, Gender_psor_Fem  # cz, sl, U
    Gender_psor_Neut  # sl
    Hyph_Yes  # cz, U
    InfForm_One, InfForm_Two, InfForm_Three  # fi
    NameType_Geo, NameType_Prs, NameType_Giv, NameType_Sur  # U, cz
    NameType_Nat, NameType_Com, NameType_Pro, NameType_Oth  # U, cz
    NounType_Com, NounType_Prop, NounType_Class  # U
    Number_abs_Sing, Number_abs_Plur  # bq, U
    Number_dat_Sing, Number_dat_Plur  # bq, U
    Number_erg_Sing, Number_erg_Plur  # bq, U
    Number_psee_Sing, Number_psee_Plur  # U
    Number_psor_Sing, Number_psor_Plur  # cz, fi, sl, U
    NumForm_Digit, NumForm_Roman, NumForm_Word  # cz, sl, U
    NumValue_One, NumValue_Two, NumValue_Three  # cz, U
    PartForm_Pres, PartForm_Past, PartForm_Agt, PartForm_Neg  # fi
    PartType_Mod, PartType_Emp, PartType_Res, PartType_Inf, PartType_Vbp  # U
    Person_abs_One, Person_abs_Two, Person_abs_Three  # bq, U
    Person_dat_One, Person_dat_Two, Person_dat_Three  # bq, U
    Person_erg_One, Person_erg_Two, Person_erg_Three  # bq, U
    Person_psor_One, Person_psor_Two, Person_psor_Three  # fi, U
    Polite_Inf, Polite_Pol  # bq, U
    Polite_abs_Inf, Polite_abs_Pol  # bq, U
    Polite_erg_Inf, Polite_erg_Pol  # bq, U
    Polite_dat_Inf, Polite_dat_Pol  # bq, U
    Prefix_Yes  # U
    PrepCase_Npr  # cz
    PrepCase_Pre  # U
    PunctSide_Ini, PunctSide_Fin  # U
    PunctType_Peri, PunctType_Qest, PunctType_Excl, PunctType_Quot  # U
    PunctType_Brck, PunctType_Comm, PunctType_Colo, PunctType_Semi  # U
    PunctType_Dash  # U
    Style_Arch, Style_Rare  # cz, fi, U
    Style_Poet, Style_Norm, Style_Coll, Style_Vrnc, Style_Sing  # cz, U
    Style_Expr, Style_Derg, Style_Vulg  # cz, U
    Style_Yes  # fi, U
    StyleVariant_StyleShort  # cz
    StyleVariant_StyleBound  # cz, sl
    VerbType_Aux, VerbType_Cop, VerbType_Mod, VerbType_Light  # U
@@ -1,11 +1,89 @@
# cython: embedsignature=True
from os import path
from .lemmatizer import Lemmatizer

try:
    import ujson as json
except ImportError:
    import json

from .parts_of_speech import UNIV_POS_NAMES
from .parts_of_speech cimport ADJ, VERB, NOUN


cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    morph.number = props.get('number', 0)
    morph.tenspect = props.get('tenspect', 0)
    morph.mood = props.get('mood', 0)
    morph.gender = props.get('gender', 0)
    morph.person = props.get('person', 0)
    morph.case = props.get('case', 0)
    morph.misc = props.get('misc', 0)


cdef class Morphology:
    def __init__(self, StringStore string_store, tag_map, lemmatizer):
        self.mem = Pool()
        self.strings = string_store
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map) + 1
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.reverse_index = {}

        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
            self.rich_tags[i].id = i
            self.rich_tags[i].name = self.strings[tag_str]
            self.rich_tags[i].morph = 0
            self.reverse_index[self.rich_tags[i].name] = i
        self._cache = PreshMapArray(self.n_tags)

    cdef int assign_tag(self, TokenC* token, tag) except -1:
        cdef int tag_id
        if isinstance(tag, basestring):
            try:
                tag_id = self.reverse_index[self.strings[tag]]
            except KeyError:
                print tag
                raise
        else:
            tag_id = tag
        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL:
            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            analysis.tag = self.rich_tags[tag_id]
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
        token.lemma = analysis.lemma
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
        token.morph = analysis.tag.morph

    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
        pass

    def load_morph_exceptions(self, dict exc):
        # Map (form, pos) to (lemma, rich tag)
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef int lemma
        cdef attr_t orth
        cdef int pos
        for tag_str, entries in exc.items():
            tag = self.strings[tag_str]
            rich_tag = self.rich_tags[self.reverse_index[tag]]
            for form_str, props in entries.items():
                cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
                orth = self.strings[form_str]
                for name_str, value_str in props.items():
                    if name_str == 'L':
                        cached.lemma = self.strings[value_str]
                    else:
                        self.assign_feature(&cached.tag.morph, name_str, value_str)
                if cached.lemma == 0:
                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
                self._cache.set(rich_tag.pos, orth, <void*>cached)

    def lemmatize(self, const univ_pos_t pos, attr_t orth):
        if self.lemmatizer is None:
            return orth
        cdef unicode py_string = self.strings[orth]
        if pos != NOUN and pos != VERB and pos != ADJ:
            return orth
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, pos)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.strings[lemma_string]
        return lemma
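To make the tag_map plumbing concrete: Morphology is constructed from a map of fine-grained tag strings to property dicts (loaded from tag_map.json by Vocab.from_dir later in this diff). A minimal example of that shape, with invented entries and feature keys:

    # Invented minimal tag map: fine-grained tag -> universal POS (+ features).
    tag_map = {
        u'NN':  {'pos': 'NOUN'},
        u'NNS': {'pos': 'NOUN', 'number': 'plur'},
        u'VBZ': {'pos': 'VERB', 'person': 'three'},
    }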
@@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
cpdef bint like_url(unicode string):
    # We're looking for things that function in text like URLs. So, valid URL
    # or not, anything that says http:// is going to be good.
    if string.startswith('http://'):
    if string.startswith('http://') or string.startswith('https://'):
        return True
    elif string.startswith('www.') and len(string) >= 5:
        return True

@@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
    return False
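A few sanity checks for the updated predicate (the rest of the function body, which handles the TLD cases, is truncated by the hunk):

    # Expected behaviour after the change:
    # like_url(u'http://spacy.io')  -> True
    # like_url(u'https://spacy.io') -> True   (newly accepted scheme)
    # like_url(u'www.google.com')   -> True   (via the 'www.' branch)
    # like_url(u'hello')            -> False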
# TODO: This should live in the language.orth
NUM_WORDS = set('zero one two three four five six seven eight nine ten'
                'eleven twelve thirteen fourteen fifteen sixteen seventeen'
                'eighteen nineteen twenty thirty forty fifty sixty seventy'
@@ -2,17 +2,22 @@
cpdef enum univ_pos_t:
    NO_TAG
    ADJ
    ADV
    ADP
    ADV
    AUX
    CONJ
    DET
    INTJ
    NOUN
    NUM
    PART
    PRON
    PRT
    PROPN
    PUNCT
    SCONJ
    SYM
    VERB
    X
    PUNCT
    EOL
    SPACE
    N_UNIV_TAGS
@@ -4,17 +4,22 @@ from __future__ import unicode_literals
UNIV_POS_NAMES = {
    "NO_TAG": NO_TAG,
    "ADJ": ADJ,
    "ADV": ADV,
    "ADP": ADP,
    "ADV": ADV,
    "AUX": AUX,
    "CONJ": CONJ,
    "DET": DET,
    "INTJ": INTJ,
    "NOUN": NOUN,
    "NUM": NUM,
    "PART": PART,
    "PRON": PRON,
    "PRT": PRT,
    "PROPN": PROPN,
    "PUNCT": PUNCT,
    "SCONJ": SCONJ,
    "SYM": SYM,
    "VERB": VERB,
    "X": X,
    "PUNCT": PUNCT,
    "SPACE": SPACE,
    "EOL": EOL
    "EOL": EOL,
    "SPACE": SPACE
}
@@ -142,6 +142,8 @@ cdef class StringStore:
    def load(self, loc):
        with codecs.open(loc, 'r', 'utf8') as file_:
            strings = file_.read().split(SEPARATOR)
        if strings == ['']:
            return None
        cdef unicode string
        cdef bytes byte_string
        for string in strings:
|
|
@ -1,4 +1,4 @@
|
|||
from libc.stdint cimport uint8_t, uint32_t, int32_t
|
||||
from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
|
||||
|
||||
from .typedefs cimport flags_t, attr_t, hash_t
|
||||
from .parts_of_speech cimport univ_pos_t
|
||||
|
@ -26,22 +26,6 @@ cdef struct LexemeC:
|
|||
float l2_norm
|
||||
|
||||
|
||||
cdef struct Morphology:
|
||||
uint8_t number
|
||||
uint8_t tenspect # Tense/aspect/voice
|
||||
uint8_t mood
|
||||
uint8_t gender
|
||||
uint8_t person
|
||||
uint8_t case
|
||||
uint8_t misc
|
||||
|
||||
|
||||
cdef struct PosTag:
|
||||
Morphology morph
|
||||
int id
|
||||
univ_pos_t pos
|
||||
|
||||
|
||||
cdef struct Entity:
|
||||
int start
|
||||
int end
|
||||
|
@ -59,8 +43,8 @@ cdef struct Constituent:
|
|||
|
||||
cdef struct TokenC:
|
||||
const LexemeC* lex
|
||||
Morphology morph
|
||||
const Constituent* ctnt
|
||||
uint64_t morph
|
||||
univ_pos_t pos
|
||||
bint spacy
|
||||
int tag
|
||||
|
|
|
@ -11,7 +11,6 @@ from .stateclass cimport StateClass
|
|||
|
||||
|
||||
cdef class Parser:
|
||||
cdef readonly object cfg
|
||||
cdef readonly Model model
|
||||
cdef readonly TransitionSystem moves
|
||||
|
||||
|
|
|
@ -67,16 +67,22 @@ def ParserFactory(transition_system):
|
|||
|
||||
|
||||
cdef class Parser:
|
||||
def __init__(self, StringStore strings, model_dir, transition_system):
|
||||
def __init__(self, StringStore strings, transition_system, model):
|
||||
self.moves = transition_system
|
||||
self.model = model
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, model_dir, strings, transition_system):
|
||||
if not os.path.exists(model_dir):
|
||||
print >> sys.stderr, "Warning: No model found at", model_dir
|
||||
elif not os.path.isdir(model_dir):
|
||||
print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
|
||||
else:
|
||||
self.cfg = Config.read(model_dir, 'config')
|
||||
self.moves = transition_system(strings, self.cfg.labels)
|
||||
templates = get_templates(self.cfg.features)
|
||||
self.model = Model(self.moves.n_moves, templates, model_dir)
|
||||
cfg = Config.read(model_dir, 'config')
|
||||
moves = transition_system(strings, cfg.labels)
|
||||
templates = get_templates(cfg.features)
|
||||
model = Model(moves.n_moves, templates, model_dir)
|
||||
return cls(strings, moves, model)
|
||||
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
|
||||
|
|
spacy/tagger.pxd (new file, 12 lines)
@@ -0,0 +1,12 @@
from ._ml cimport Model
from .structs cimport TokenC
from .vocab cimport Vocab


cdef class Tagger:
    cdef readonly Vocab vocab
    cdef readonly Model model
    cdef public dict freqs

    cdef int predict(self, int i, const TokenC* tokens) except -1
    cdef int update(self, int i, const TokenC* tokens, int gold) except -1
spacy/tagger.pyx (new file, 220 lines)
@@ -0,0 +1,220 @@
import json
from os import path
from collections import defaultdict

from thinc.typedefs cimport atom_t, weight_t

from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE

from .attrs cimport *
from ._ml cimport arg_max


cpdef enum:
    P2_orth, P2_cluster, P2_shape, P2_prefix, P2_suffix, P2_pos, P2_lemma, P2_flags
    P1_orth, P1_cluster, P1_shape, P1_prefix, P1_suffix, P1_pos, P1_lemma, P1_flags
    W_orth, W_cluster, W_shape, W_prefix, W_suffix, W_pos, W_lemma, W_flags
    N1_orth, N1_cluster, N1_shape, N1_prefix, N1_suffix, N1_pos, N1_lemma, N1_flags
    N2_orth, N2_cluster, N2_shape, N2_prefix, N2_suffix, N2_pos, N2_lemma, N2_flags
    N_CONTEXT_FIELDS


cdef class Tagger:
    """A part-of-speech tagger for English"""
    @classmethod
    def read_config(cls, data_dir):
        return json.load(open(path.join(data_dir, 'pos', 'config.json')))

    @classmethod
    def default_templates(cls):
        return (
            (W_orth,),
            (P1_lemma, P1_pos),
            (P2_lemma, P2_pos),
            (N1_orth,),
            (N2_orth,),

            (W_suffix,),
            (W_prefix,),

            (P1_pos,),
            (P2_pos,),
            (P1_pos, P2_pos),
            (P1_pos, W_orth),
            (P1_suffix,),
            (N1_suffix,),

            (W_shape,),
            (W_cluster,),
            (N1_cluster,),
            (N2_cluster,),
            (P1_cluster,),
            (P2_cluster,),

            (W_flags,),
            (N1_flags,),
            (N2_flags,),
            (P1_flags,),
            (P2_flags,),
        )

    @classmethod
    def blank(cls, vocab, templates):
        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
        return cls(vocab, model)

    @classmethod
    def from_dir(cls, data_dir, vocab):
        if path.exists(path.join(data_dir, 'templates.json')):
            templates = json.load(open(path.join(data_dir, 'templates.json')))
        else:
            templates = cls.default_templates()
        model = Model(vocab.morphology.n_tags, templates, data_dir)
        return cls(vocab, model)

    def __init__(self, Vocab vocab, model):
        self.vocab = vocab
        self.model = model

        # TODO: Move this to tag map
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1

    @property
    def tag_names(self):
        return self.vocab.morphology.tag_names

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Args:
            tokens (Doc): The tokens to be tagged.
        """
        if tokens.length == 0:
            return 0
        cdef int i
        cdef const weight_t* scores
        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                guess = self.predict(i, tokens.data)
                self.vocab.morphology.assign_tag(&tokens.data[i], guess)

        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def tag_from_strings(self, Doc tokens, object tag_strs):
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def train(self, Doc tokens, object gold_tag_strs):
        assert len(tokens) == len(gold_tag_strs)
        cdef int i
        cdef int loss
        cdef const weight_t* scores
        try:
            golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
        except ValueError:
            raise ValueError(
                [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
        correct = 0
        for i in range(tokens.length):
            guess = self.update(i, tokens.data, golds[i])
            loss = golds[i] != -1 and guess != golds[i]

            self.vocab.morphology.assign_tag(&tokens.data[i], guess)

            correct += loss == 0
            self.freqs[TAG][tokens.data[i].tag] += 1
        return correct

    cdef int predict(self, int i, const TokenC* tokens) except -1:
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        return arg_max(scores, self.model.n_classes)

    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        guess = arg_max(scores, self.model.n_classes)
        loss = guess != gold if gold != -1 else 0
        self.model.update(context, guess, gold, loss)
        return guess


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0
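The tagger is a classic windowed linear model: predict() fills a fixed context array from the token and two neighbours on each side, lets the feature templates pick atom tuples out of it, and takes the argmax over summed weights. A toy re-statement in plain Python (invented stand-in for the thinc Model, not the diff's API):

    from collections import defaultdict

    def predict_tag(context, templates, weights):
        # context: list of atoms; templates: tuples of context indices;
        # weights: {(template, feature): {tag: weight}}
        scores = defaultdict(float)
        for template in templates:
            feature = tuple(context[i] for i in template)
            for tag, weight in weights.get((template, feature), {}).items():
                scores[tag] += weight
        return max(scores, key=scores.get) if scores else 0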
@@ -4,15 +4,10 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool

from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology
from .structs cimport LexemeC, TokenC
from .strings cimport StringStore
from .tokens.doc cimport Doc
from .vocab cimport Vocab, _Cached


cdef union LexemesOrTokens:
    const LexemeC* const* lexemes
    TokenC* tokens
from .vocab cimport Vocab, LexemesOrTokens, _Cached


cdef class Tokenizer:
|
|
@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
|
|||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
from .morphology cimport set_morph_from_dict
|
||||
from .strings cimport hash_string
|
||||
cimport cython
|
||||
|
||||
|
@ -29,7 +28,7 @@ cdef class Tokenizer:
|
|||
self._suffix_re = suffix_re
|
||||
self._infix_re = infix_re
|
||||
self.vocab = vocab
|
||||
self._load_special_tokenization(rules, self.vocab.pos_tags)
|
||||
self._load_special_tokenization(rules)
|
||||
|
||||
@classmethod
|
||||
def from_dir(cls, Vocab vocab, data_dir):
|
||||
|
@ -193,9 +192,7 @@ cdef class Tokenizer:
|
|||
tokens.push_back(prefixes[0][i], False)
|
||||
if string:
|
||||
cache_hit = self._try_cache(hash_string(string), tokens)
|
||||
if cache_hit:
|
||||
pass
|
||||
else:
|
||||
if not cache_hit:
|
||||
match = self.find_infix(string)
|
||||
if match is None:
|
||||
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||
|
@ -242,7 +239,7 @@ cdef class Tokenizer:
|
|||
match = self._suffix_re.search(string)
|
||||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, object rules, object tag_map):
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
'''Add a special-case tokenization rule.
|
||||
'''
|
||||
cdef int i
|
||||
|
@ -253,29 +250,11 @@ cdef class Tokenizer:
|
|||
cdef dict props
|
||||
cdef LexemeC** lexemes
|
||||
cdef hash_t hashed
|
||||
for chunk, substrings in sorted(rules.items()):
|
||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||
for i, props in enumerate(substrings):
|
||||
form = props['F']
|
||||
lemma = props.get("L", None)
|
||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
|
||||
if lemma is not None:
|
||||
tokens[i].lemma = self.vocab.strings[lemma]
|
||||
else:
|
||||
tokens[i].lemma = 0
|
||||
if 'pos' in props:
|
||||
tokens[i].tag = self.vocab.strings[props['pos']]
|
||||
tokens[i].pos = tag_map[props['pos']][0]
|
||||
# These are defaults, which can be over-ridden by the
|
||||
# token-specific props.
|
||||
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
|
||||
if tokens[i].lemma == 0:
|
||||
tokens[i].lemma = tokens[i].lex.orth
|
||||
set_morph_from_dict(&tokens[i].morph, props)
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
cached.is_lex = False
|
||||
cached.data.tokens = tokens
|
||||
hashed = hash_string(chunk)
|
||||
self._specials.set(hashed, cached)
|
||||
self._cache.set(hashed, cached)
|
||||
cached.data.tokens = self.vocab.make_fused_token(substrings)
|
||||
key = hash_string(chunk)
|
||||
self._specials.set(key, cached)
|
||||
self._cache.set(key, cached)
|
||||
|
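For reference, a special-case entry maps a chunk to the substrings it should always be split into, using the 'F' (form), 'L' (lemma) and 'pos' keys handled by make_fused_token later in this diff. A plausible rule (values invented):

    # Invented example: always split "don't" into two fused tokens.
    special_cases = {
        u"don't": [
            {'F': u'do',  'L': u'do',  'pos': 'VB'},
            {'F': u"n't", 'L': u'not', 'pos': 'RB'},
        ]
    }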
@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil


ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef const TokenC* const_TokenC_ptr

ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
    TokenC_ptr
    const_TokenC_ptr


cdef class Doc:
@@ -14,6 +14,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..parts_of_speech cimport univ_pos_t
from ..lexeme cimport Lexeme
from .spans cimport Span
from .token cimport Token
from ..serialize.bits cimport BitArray

@@ -210,7 +211,7 @@ cdef class Doc:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.data[self.length]
        if LexemeOrToken is TokenC_ptr:
        if LexemeOrToken is const_TokenC_ptr:
            t[0] = lex_or_tok[0]
        else:
            t.lex = lex_or_tok

@@ -218,6 +219,7 @@ cdef class Doc:
            t.idx = 0
        else:
            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
        assert t.lex.orth != 0
        t.spacy = has_space
        self.length += 1
        self._py_tokens.append(None)
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from collections import defaultdict

from ..structs cimport Morphology, TokenC, LexemeC
from ..structs cimport TokenC, LexemeC
from ..typedefs cimport flags_t, attr_t
from ..attrs cimport attr_id_t
from ..parts_of_speech cimport univ_pos_t


@@ -20,6 +20,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV

from ..lexeme cimport Lexeme


cdef class Token:
    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
@@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
from .morphology cimport Morphology


cdef LexemeC EMPTY_LEXEME

@@ -14,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME

cdef union LexemesOrTokens:
    const LexemeC* const* lexemes
    TokenC* tokens
    const TokenC* tokens


cdef struct _Cached:

@@ -27,15 +28,18 @@ cdef class Vocab:
    cpdef public lexeme_props_getter
    cdef Pool mem
    cpdef readonly StringStore strings
    cdef readonly object pos_tags
    cpdef readonly Morphology morphology
    cdef readonly int length
    cdef public object _serializer
    cdef public object data_dir
    cdef public float oov_prob
    cdef public object get_lex_attr
    cdef public object pos_tags

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
    cdef const TokenC* make_fused_token(self, substrings) except NULL

    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
@@ -17,10 +17,12 @@ from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer

from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
from .attrs cimport PROB


DEF MAX_VEC_SIZE = 100000

@@ -35,30 +37,31 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
    def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False):
    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        #self.pos_tags = pos_tags if pos_tags is not None else {}
        self.pos_tags = {}

        self.get_lex_attr = get_lex_attr
        self.repvec_length = 0
        self.length = 0
        self._add_lex_to_vocab(0, &EMPTY_LEXEME)
        if data_dir is not None:
            if not path.exists(data_dir):
                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
            self.load_lexemes(path.join(data_dir, 'strings.txt'),
                              path.join(data_dir, 'lexemes.bin'))
            if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
                self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))

        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))

        self.length = 1
        self._serializer = None
        self.data_dir = data_dir

    @classmethod
    def from_dir(cls, data_dir, get_lex_attr=None, vectors=None):
        if not path.exists(data_dir):
            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
        if not path.isdir(data_dir):
            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)

        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)

        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
        if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
            self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
        return self

    property serializer:
        def __get__(self):
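For orientation, the on-disk layout from_dir expects, with file names taken from the code above (vec.bin being optional):

    data_dir/
        tag_map.json    # fine-grained tag -> universal POS + features
        strings.txt     # StringStore contents
        lexemes.bin     # binary LexemeC records
        vec.bin         # word vectors, loaded only if present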
@@ -84,7 +87,9 @@ cdef class Vocab:
        cdef LexemeC* lex
        cdef hash_t key = hash_string(string)
        lex = <LexemeC*>self._by_hash.get(key)
        cdef size_t addr
        if lex != NULL:
            assert lex.orth == self.strings[string]
            return lex
        else:
            return self._new_lexeme(mem, string)

@@ -103,16 +108,29 @@ cdef class Vocab:
            return self._new_lexeme(mem, self.strings[orth])

    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
        cdef hash_t key
        cdef bint is_oov = mem is not self.mem
        mem = self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        for attr, func in self.lex_attr_getters.items():
            Lexeme.set_struct_attr(lex, attr, func(string))
        lex.orth = self.strings[string]
        lex.length = len(string)
        lex.id = self.length
        if self.get_lex_attr is not None:
            for attr, func in self.get_lex_attr.items():
                value = func(string)
                if isinstance(value, unicode):
                    value = self.strings[value]
                if attr == PROB:
                    lex.prob = value
                else:
                    Lexeme.set_struct_attr(lex, attr, value)
        if is_oov:
            lex.id = 0
        else:
            self._add_lex_to_vocab(hash_string(string), lex)
            key = hash_string(string)
            self._add_lex_to_vocab(key, lex)
        assert lex != NULL, string
        return lex
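The ownership rule in _new_lexeme is worth restating as pseudocode comments (sketch, not code from the diff):

    # is_oov = (mem is not self.mem)    # caller-owned pool => transient lexeme
    # if is_oov: lex.id = 0             # never interned in the vocab
    # else:      _add_lex_to_vocab(hash_string(string), lex)  # shared, counted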
@ -125,7 +143,7 @@ cdef class Vocab:
|
|||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
for orth, addr in self._by_orth.items():
|
||||
yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length)
|
||||
yield Lexeme(self, orth)
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
|
@ -142,23 +160,29 @@ cdef class Vocab:
|
|||
An instance of the Lexeme Python class, with data copied on
|
||||
instantiation.
|
||||
'''
|
||||
cdef const LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == int:
|
||||
orth = id_or_string
|
||||
lexeme = <LexemeC*>self._by_orth.get(orth)
|
||||
if lexeme == NULL:
|
||||
raise KeyError(id_or_string)
|
||||
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
|
||||
elif type(id_or_string) == unicode:
|
||||
lexeme = self.get(self.mem, id_or_string)
|
||||
assert lexeme.orth == self.strings[id_or_string]
|
||||
if type(id_or_string) == unicode:
|
||||
orth = self.strings[id_or_string]
|
||||
else:
|
||||
raise ValueError("Vocab unable to map type: "
|
||||
"%s. Maps unicode --> Lexeme or "
|
||||
"int --> Lexeme" % str(type(id_or_string)))
|
||||
return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length)
|
||||
orth = id_or_string
|
||||
return Lexeme(self, orth)

cdef const TokenC* make_fused_token(self, substrings) except NULL:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
token = &tokens[i]
# Set the special tokens up to have morphology and lemmas if
# specified, otherwise use the part-of-speech tag (if specified)
token.lex = <LexemeC*>self.get(self.mem, props['F'])
if 'pos' in props:
self.morphology.assign_tag(token, props['pos'])
if 'L' in props:
tokens[i].lemma = self.strings[props['L']]
for feature, value in props.get('morph', {}).items():
self.morphology.assign_feature(&token.morph, feature, value)
return tokens
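make_fused_token consumes a list of property dicts, one per sub-token, reading the keys visible above: 'F' for the surface form, optional 'L' for a lemma, 'pos' for a tag, and 'morph' for extra features. An illustrative payload follows; the concrete rule is an assumption for illustration, not shipped data:

    # Hypothetical special-case rule mapping u"don't" to two fused tokens.
    # Key names ('F', 'L', 'pos', 'morph') come from the loop above;
    # the values are illustrative assumptions.
    substrings = [
        {'F': u'do'},                             # surface form only
        {'F': u"n't",
         'L': u'not',                             # lemma interned via vocab.strings
         'pos': 'RB',                             # tag passed to morphology.assign_tag
         'morph': {'Negative': 'yes'}},           # extra features, assigned one by one
    ]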

def dump(self, loc):
if path.exists(loc):
assert not path.isdir(loc)
@@ -1,6 +1,7 @@
import pytest


@pytest.mark.models
def test_initial(EN):
doc = EN.tokenizer(u'I ate the pizza with anchovies.')
EN.tagger(doc)
@@ -41,25 +41,10 @@ def test_attribute():


def test_vocab_codec():
def get_lex_props(string, prob):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}

vocab = Vocab()
vocab['dog'] = get_lex_props('dog', 0.001)
vocab['the'] = get_lex_props('the', 0.05)
vocab['jumped'] = get_lex_props('jumped', 0.005)
lex = vocab['dog']
lex = vocab['the']
lex = vocab['jumped']

codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab])
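The codec is built from (orth, prob) pairs so that frequent words receive short codes. A generic pure-Python illustration of that idea (heap-based construction; this sketches the principle only, not spaCy's HuffmanCodec internals):

    # Generic Huffman construction over (symbol, weight) pairs: the two
    # rarest subtrees are merged first, so rare symbols get longer codes.
    import heapq
    import itertools

    def huffman_codes(weighted_symbols):
        tiebreak = itertools.count()     # keeps heapq from comparing dicts
        heap = [(w, next(tiebreak), {s: ''}) for s, w in weighted_symbols]
        heapq.heapify(heap)
        while len(heap) > 1:
            w1, _, left = heapq.heappop(heap)    # two rarest subtrees
            w2, _, right = heapq.heappop(heap)
            merged = dict((s, '0' + c) for s, c in left.items())
            merged.update((s, '1' + c) for s, c in right.items())
            heapq.heappush(heap, (w1 + w2, next(tiebreak), merged))
        return heap[0][2]

    codes = huffman_codes([('the', 0.05), ('jumped', 0.005), ('dog', 0.001)])
    assert len(codes['the']) <= len(codes['dog'])   # frequent word, shorter code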
@@ -5,6 +5,7 @@ import re
import pytest
import numpy

from spacy.language import Language
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
@@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray


def get_lex_props(string, prob=-22, is_oov=False):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string[0],
'suffix': string[-3:],
'cluster': 0,
'prob': prob,
'sentiment': 0
}


@pytest.fixture
def vocab():
vocab = Vocab(get_lex_props=get_lex_props)
vocab['dog'] = get_lex_props('dog', 0.001)
vocab = Vocab(Language.default_lex_attrs())
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
vocab['the'] = get_lex_props('the', 0.01)
vocab['quick'] = get_lex_props('quick', 0.005)
vocab['jumped'] = get_lex_props('jumped', 0.007)
lex = vocab['the']
lex = vocab['quick']
lex = vocab['jumped']
return vocab
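The updated fixture leans on lazy lexeme creation: with Language.default_lex_attrs() installed as attribute getters, merely indexing the vocab materializes an entry, so the hand-built property dicts above become unnecessary. A minimal sketch of that pattern, mirroring the fixture:

    # Indexing with an unseen string builds the lexeme via the attr getters.
    from spacy.language import Language
    from spacy.vocab import Vocab

    vocab = Vocab(Language.default_lex_attrs())
    lex = vocab[u'quick']                                     # created on first access
    assert vocab[vocab.strings[u'quick']].orth_ == u'quick'   # int key agrees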

@@ -14,6 +14,7 @@ def tagged(EN):
tokens = EN(string, tag=True, parse=False)
return tokens


@pytest.mark.models
def test_spaces(tagged):
assert tagged[0].pos != SPACE
assert tagged[0].pos_ != 'SPACE'
@@ -1,80 +1,81 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
import pytest


@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
# Load the pipeline, and call it with some text.
nlp = spacy.en.English()
tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
tag=True, parse=False)
o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
assert o == u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"

o = nlp.vocab[u'back'].prob
assert o == -7.033305644989014
o = nlp.vocab[u'not'].prob
assert o == -5.332601070404053
o = nlp.vocab[u'quietly'].prob
assert o == -11.994928359985352
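The exact log-probability values asserted here are tied to one release of the English model data. A more version-robust check compares words relative to each other (sketch, reusing the same pipeline object as above):

    # Relative order of unigram log probabilities is stable even when
    # exact values drift between model versions.
    assert nlp.vocab[u'not'].prob > nlp.vocab[u'quietly'].prob   # 'not' is commoner
    assert nlp.vocab[u'back'].prob > nlp.vocab[u'quietly'].prob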


@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV
nlp = spacy.en.English()
# Find log probability of Nth most frequent word
probs = [lex.prob for lex in nlp.vocab]
probs.sort()
is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'

pleaded = tokens[7]
assert pleaded.repvec.shape == (300,)
o = pleaded.repvec[:5]
assert sum(o) != 0
from numpy import dot
from numpy.linalg import norm

cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
words.reverse()
o = [w.orth_ for w in words[0:20]]
assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
u'countersued', u'remonstrated', u'begged', u'apologised',
u'consented', u'acquiesced', u'petitioned', u'quarreled',
u'appealed', u'pleading']
o = [w.orth_ for w in words[50:60]]
assert o == [u'martialed', u'counselled', u'bragged',
u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
u'dissented', u'yearned']
o = [w.orth_ for w in words[100:110]]
assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
u'clerked']

#o = [w.orth_ for w in words[1000:1010]]
#assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
#o = [w.orth_ for w in words[50000:50010]]
#assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# u'dirty', u'rims', u'artists']
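Sorting the whole vocabulary with a Python key function calls cosine once per word. A vectorized equivalent stacks the vectors into one matrix and ranks with a single matrix-vector product; a sketch under that assumption, reusing words and pleaded as defined in test3 (the asserted neighbour lists are model-version specific either way):

    # Vectorized cosine ranking: one dot product instead of a Python-level
    # cosine call per word. Assumes `words` and `pleaded` from test3.
    import numpy

    mat = numpy.vstack([w.repvec for w in words])             # (n_words, 300)
    target = pleaded.repvec
    sims = mat.dot(target) / (numpy.linalg.norm(mat, axis=1)
                              * numpy.linalg.norm(target))
    order = sims.argsort()[::-1]                              # best match first
    nearest = [words[i].orth_ for i in order[:20]]            # starts with 'pleaded'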

#@pytest.mark.models
#def test_1():
# import spacy.en
# from spacy.parts_of_speech import ADV
# # Load the pipeline, and call it with some text.
# nlp = spacy.en.English()
# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
# tag=True, parse=False)
# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
# assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’"
#
# o = nlp.vocab[u'back'].prob
# assert o == -7.033305644989014
# o = nlp.vocab[u'not'].prob
# assert o == -5.332601070404053
# o = nlp.vocab[u'quietly'].prob
# assert o == -11.994928359985352
#
#
#@pytest.mark.m
#def test2():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
#@pytest.mark.models
#def test3():
# import spacy.en
# from spacy.parts_of_speech import ADV
# nlp = spacy.en.English()
# # Find log probability of Nth most frequent word
# probs = [lex.prob for lex in nlp.vocab]
# probs.sort()
# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
# assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’'
#
# pleaded = tokens[7]
# assert pleaded.repvec.shape == (300,)
# o = pleaded.repvec[:5]
# assert sum(o) != 0
# from numpy import dot
# from numpy.linalg import norm
#
# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec]
# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
# words.reverse()
# o = [w.orth_ for w in words[0:20]]
# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded',
# u'pleads', u'testified', u'conspired', u'motioned', u'demurred',
# u'countersued', u'remonstrated', u'begged', u'apologised',
# u'consented', u'acquiesced', u'petitioned', u'quarreled',
# u'appealed', u'pleading']
# o = [w.orth_ for w in words[50:60]]
# assert o == [u'martialed', u'counselled', u'bragged',
# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused',
# u'dissented', u'yearned']
# o = [w.orth_ for w in words[100:110]]
# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced',
# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed',
# u'clerked']
#
# #o = [w.orth_ for w in words[1000:1010]]
# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled',
# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged']
# #o = [w.orth_ for w in words[50000:50010]]
# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid',
# # u'dirty', u'rims', u'artists']