* Rearrange POS stuff, so that language-specific stuff can live in language-specific modules

Matthew Honnibal 2014-12-07 23:52:41 +11:00
parent 327383e38a
commit ef4398b204
11 changed files with 127 additions and 154 deletions

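The diff below moves the POS feature context and templates out of spacy.context and spacy.tagger and into the English-specific modules, so tagging is driven through the Language subclass. A rough usage sketch of the resulting API (illustrative only: EN, tokenize, set_pos and train_pos appear in the diff; the surrounding loop and a ready pos_tagger are assumed):

# Illustrative sketch, not part of the commit.  Assumes EN.pos_tagger has
# already been set up; the training loop around train_pos is assumed.
from spacy.en import EN

def train_and_tag(train_sents, nr_iter=5):
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for raw, golds in train_sents:
            tokens = EN.tokenize(raw)
            n_corr += EN.train_pos(tokens, golds)   # returns number correct
            total += len(tokens)
        print('%.4f' % (n_corr / float(total) * 100))
    tokens = EN.tokenize(u'An example sentence.')
    EN.set_pos(tokens)           # tags written into tokens.data[i].pos
    return tokens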
View File

@@ -55,7 +55,6 @@ exts = [
    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),

View File

@@ -1,64 +1 @@
from thinc.typedefs cimport atom_t
from .tokens cimport TokenC

cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense
    N_FIELDS

cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])

cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense

View File

@@ -1,5 +1,9 @@
from spacy.lang cimport Language
from spacy.tokens cimport Tokens
from thinc.typedefs cimport atom_t
from .lang cimport Language
from .tokens cimport Tokens
from .tokens cimport TokenC
# Flags
cpdef enum FlagID:
@@ -28,5 +32,67 @@ cpdef enum FlagID:
    IN_NAMES

cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense
    N_CONTEXT_FIELDS

cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])

cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense

cdef class English(Language):
    pass

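For reference, fill_pos_context above writes seven attributes per token (sic, cluster, shape, prefix, suffix, pos, sense) into a flat array of atoms, one block of seven per window position P2, P1, W, N1, N2, so N_CONTEXT_FIELDS works out to 35. A minimal pure-Python sketch of that layout (the padding handling is a simplification; the Cython version indexes the token array directly):

# Pure-Python sketch of the context layout; illustrative only.
FIELDS = ('sic', 'cluster', 'shape', 'prefix', 'suffix', 'pos', 'sense')
N_CONTEXT_FIELDS = 5 * len(FIELDS)  # 35, the final member of the enum above

def fill_pos_context_py(tokens, i):
    # tokens: list of dicts with the seven fields, standing in for TokenC*.
    context = [0] * N_CONTEXT_FIELDS
    padding = dict.fromkeys(FIELDS, 0)
    window = [tokens[j] if 0 <= j < len(tokens) else padding
              for j in (i - 2, i - 1, i, i + 1, i + 2)]
    for pos_idx, tok in enumerate(window):
        for field_idx, name in enumerate(FIELDS):
            context[pos_idx * len(FIELDS) + field_idx] = tok[name]
    return context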
View File

@@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
# TODO
#The script translate_treebank_tokenization can be used to transform a treebank's
#annotation to use one of the spacy tokenization schemes.
from __future__ import unicode_literals
cimport lang
@@ -42,6 +37,32 @@ from .typedefs cimport flags_t
import orth
POS_TEMPLATES = (
    (W_sic,),
    (P1_sic,),
    (N1_sic,),
    (N2_sic,),
    (P2_sic,),
    (W_suffix,),
    (W_prefix,),
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),
    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
)

cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.
@@ -49,6 +70,9 @@ cdef class English(Language):
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
@@ -64,5 +88,22 @@ cdef class English(Language):
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context)

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
            c += tokens.data[i].pos == golds[i]
        return c
EN = English('en')

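Each tuple in POS_TEMPLATES names context slots declared in spacy/en.pxd; the feature extractor turns every template into one feature per token by reading those slots out of the filled context. A simplified approximation of that step (thinc's Extractor does this in C and handles template ids and values differently; this only shows what a template such as (P1_pos, W_sic) expresses):

# Simplified view of template-based feature extraction; not thinc's API.
def extract_features(context, templates):
    feats = []
    for templ_id, templ in enumerate(templates):
        # e.g. (P1_pos, W_sic) pairs the previous tag with the current word
        feats.append((templ_id, tuple(context[slot] for slot in templ)))
    return feats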
View File

@@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr
cdef class Lexicon:
    cpdef public set_flags
    cpdef public get_lex_props
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings

View File

@@ -37,7 +37,7 @@ cdef class Language:
        self._prefix_re = re.compile(prefix)
        self._suffix_re = re.compile(suffix)
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.set_flags)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self.pos_tagger = None
@@ -249,13 +249,13 @@ cdef class Lexicon:
    Also interns UTF-8 strings, and maps them to consecutive integer IDs.
    '''
    def __init__(self, object set_flags=None):
    def __init__(self, object get_props):
        self.mem = Pool()
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.size = 2
        self.set_flags = set_flags
        self.get_lex_props = get_props

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
@@ -267,9 +267,10 @@ cdef class Lexicon:
            return lex
        if string.n < 3:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
            while self.lexemes.size() < (lex.id + 1):

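The Lexicon now takes a get_props callable rather than a set_flags one: any function from a unicode string to a dict of lexical properties will do, and English.get_props (above) supplies 'flags' and 'dense'. A toy stand-in, assuming the keys read by lexeme_init in spacy/lexeme.pyx below:

# Toy get_props callable; illustrative only.  lexeme_init interns the
# 'dense' value through the string store, so it must be a string, and it
# falls back to defaults for missing keys such as 'cluster' or 'prob'.
def toy_get_props(string):
    return {'flags': 0, 'dense': string}   # stand-in for orth.word_shape(string)

# lexicon = Lexicon(toy_get_props)   # mirrors Lexicon(self.get_props) above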
View File

@@ -72,17 +72,14 @@ cpdef enum attr_id_t:
    ID
    SIC
    STEM
    DENSE
    SHAPE
    ASCIIED
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    SENSE_TYPE
cdef struct Lexeme:
@@ -90,20 +87,16 @@ cdef struct Lexeme:
    attr_t id
    attr_t sic
    attr_t stem
    attr_t dense
    attr_t shape
    attr_t asciied
    attr_t prefix
    attr_t suffix
    attr_t length
    attr_t cluster
    attr_t pos_type
    attr_t sense_type
    float prob
    float lower_pc
    float sentiment
@@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.sic
    elif feat_name == DENSE:
        return lex.dense
    elif feat_name == STEM:
        return lex.stem
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == ASCIIED:
        return lex.asciied
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    elif feat_name == SENSE_TYPE:
        return lex.sense_type
    else:
        return 0

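get_attr is a straight dispatch from attr_id_t constants to Lexeme struct fields, with the STEM, ASCIIED and SENSE_TYPE branches dropped in this commit. The same mapping in plain Python, over a dict-shaped lexeme (illustrative only; the real function is an inline nogil lookup on the struct):

# Plain-Python analogue of the trimmed get_attr dispatch; illustrative only.
ATTR_FIELDS = {
    'SIC': 'sic', 'DENSE': 'dense', 'SHAPE': 'shape', 'PREFIX': 'prefix',
    'SUFFIX': 'suffix', 'LENGTH': 'length', 'CLUSTER': 'cluster',
    'POS_TYPE': 'pos_type',
}

def get_attr_py(lex, feat_name):
    field = ATTR_FIELDS.get(feat_name)
    return lex[field] if field is not None else 0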
View File

@@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
    lex.cluster = props.get('cluster', 0)
    lex.pos_type = props.get('pos_type', 0)
    lex.sense_type = props.get('sense_type', 0)
    lex.prob = props.get('prob', 0)
    lex.lower_pc = props.get('lower_pc', 0.0)
    lex.prefix = string_store[string[:1]]
    lex.suffix = string_store[string[-3:]]
    lex.shape = string_store[orth.word_shape(string)]
    lex.dense = lex.sic if lex.prob >= -10 else lex.shape
    lex.stem = string_store[props.get('stem', string)]
    lex.asciied = string_store[orth.asciied(string)]
    lex.dense = string_store[props['dense']]
    lex.flags = props.get('flags', 0)
    return lex

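lexeme_init still derives prefix, suffix and shape itself, but now takes the 'dense' form straight from the props dict instead of computing it from the probability. The string operations involved, shown on a plain Python string (the shape value is illustrative):

# Illustrative only.
word = u'Apple'
prefix = word[:1]     # u'A'
suffix = word[-3:]    # u'ple'  (for words shorter than 3, the whole word)
# orth.word_shape(word) produces a shape code along the lines of u'Xxxxx';
# English.get_props stores that string under 'dense', and lexeme_init
# interns it via string_store[props['dense']].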
View File

@@ -3,25 +3,17 @@ from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMap
from .typedefs cimport hash_t
from .tokens cimport Tokens
cpdef enum TagType:
    POS
    SENSE

cdef class Tagger:
    cpdef int set_tags(self, Tokens tokens) except -1
    cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except *
    cdef class_t predict(self, atom_t* context, object golds=*) except *
    cpdef readonly Pool mem
    cpdef readonly Extractor extractor
    cpdef readonly LinearModel model
    cpdef readonly TagType tag_type
    cpdef readonly list tag_names
    cdef dict tagdict

View File

@@ -2,9 +2,6 @@
from __future__ import unicode_literals
from __future__ import division
from .context cimport fill_context
from .context cimport N_FIELDS
from os import path
import os
import shutil
@@ -15,12 +12,11 @@ import cython
from thinc.features cimport Feature, count_feats
def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
def setup_model_dir(tag_names, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'tag_type': tag_type,
        'templates': templates,
        'tag_names': tag_names,
        'tag_counts': tag_counts,
@@ -29,29 +25,6 @@ def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
        json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=10):
    cdef Tokens tokens
    cdef Tagger tagger = Tagger(model_dir)
    cdef int i
    cdef class_t guess = 0
    cdef class_t gold
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for tokens, golds in train_sents:
            assert len(tokens) == len(golds), [t.string for t in tokens]
            for i in range(tokens.length):
                gold = golds[i]
                guess = tagger.predict(i, tokens, [gold])
                tokens.set_tag(i, tagger.tag_type, guess)
                total += 1
                n_corr += guess == gold
        print('%.4f' % ((n_corr / total) * 100))
        random.shuffle(train_sents)
    tagger.model.end_training()
    tagger.model.dump(path.join(model_dir, 'model'))

cdef class Tagger:
    """Assign part-of-speech, named entity or supersense tags, using greedy
    decoding. The tagger reads its model and configuration from disk.
@@ -61,26 +34,13 @@ cdef class Tagger:
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        self.tag_names = cfg['tag_names']
        self.tag_type = cfg['tag_type']
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

    cpdef int set_tags(self, Tokens tokens) except -1:
        """Assign tags to a Tokens object.

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> assert tokens[0].pos == 'NO_TAG'
        >>> EN.pos_tagger.set_tags(tokens)
        >>> assert tokens[0].pos == 'DT'
        """
        cdef int i
        for i in range(tokens.length):
            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))

    cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *:
    cdef class_t predict(self, atom_t* context, object golds=None) except *:
        """Predict the tag of tokens[i]. The tagger remembers the features and
        prediction, in case you later call tell_answer.
@@ -88,11 +48,6 @@ cdef class Tagger:
        >>> tag = EN.pos_tagger.predict(0, tokens)
        >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
        """
        cdef atom_t sic = tokens.data[i].lex.sic
        if sic in self.tagdict:
            return self.tagdict[sic]
        cdef atom_t[N_FIELDS] context
        fill_context(context, i, tokens.data)
        cdef int n_feats
        cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
        cdef weight_t* scores = self.model.get_scores(feats, n_feats)

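With the tag-dictionary shortcut and the context filling moved out to the language module, Tagger.predict reduces to feature extraction and scoring over a caller-supplied context. Roughly, in Python terms (extract and score stand in for thinc's Extractor.get_feats and LinearModel.get_scores; the argmax over the scores array is assumed to mirror what the C code does):

# Rough outline of what remains in Tagger.predict; not thinc's actual API.
def predict(context, extract, score, n_classes):
    feats = extract(context)      # one feature per template
    scores = score(feats)         # one weight per tag class
    return max(range(n_classes), key=lambda clas: scores[clas])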
View File

@@ -140,11 +140,11 @@ cdef class Token:
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.postype = lex['pos_type']
        self.sensetype = lex['sense_type']
        self.sensetype = 0
        self.sic = lex['sic']
        self.norm = lex['dense']
        self.shape = lex['shape']
        self.suffix = lex['asciied']
        self.suffix = lex['suffix']
        self.prefix = lex['prefix']
        self.prob = lex['prob']