Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00
* Rearrange POS stuff, so that language-specific stuff can live in language-specific modules
This commit is contained in:
parent 327383e38a
commit ef4398b204

setup.py | 1
@@ -55,7 +55,6 @@ exts = [
    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
@@ -1,64 +1 @@
from thinc.typedefs cimport atom_t
from .tokens cimport TokenC


cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense

    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense

    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense

    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense

    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense

    N_FIELDS


cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense
spacy/en.pxd | 70

@@ -1,5 +1,9 @@
from spacy.lang cimport Language
from spacy.tokens cimport Tokens
from thinc.typedefs cimport atom_t

from .lang cimport Language
from .tokens cimport Tokens
from .tokens cimport TokenC


# Flags
cpdef enum FlagID:

@@ -28,5 +32,67 @@ cpdef enum FlagID:
    IN_NAMES


cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense

    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense

    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense

    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense

    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense

    N_CONTEXT_FIELDS


cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense


cdef class English(Language):
    pass
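To make the layout above concrete, here is a rough pure-Python analogue of fill_pos_context and _fill_from_token: seven atoms per token over a five-token window, written into one flat array. The Token class, the explicit padding, and the plain list stand in for the Cython TokenC struct and atom_t buffer; they are illustrative assumptions, not spaCy's API.

from dataclasses import dataclass

FIELDS_PER_TOKEN = 7          # sic, cluster, shape, prefix, suffix, pos, sense
WINDOW = (-2, -1, 0, 1, 2)    # P2, P1, W, N1, N2
N_CONTEXT_FIELDS = FIELDS_PER_TOKEN * len(WINDOW)

@dataclass
class Token:
    sic: int = 0
    cluster: int = 0
    shape: int = 0
    prefix: int = 0
    suffix: int = 0
    pos: int = 0
    sense: int = 0

def fill_pos_context(i, tokens):
    """Flatten the five-token window around position i into one atom array."""
    context = [0] * N_CONTEXT_FIELDS
    # Pad with empty tokens so i-2 .. i+2 are always valid; the real Cython
    # code relies on padding inside the Tokens buffer instead.
    padded = [Token()] * 2 + list(tokens) + [Token()] * 2
    for slot, offset in enumerate(WINDOW):
        t = padded[i + 2 + offset]
        base = slot * FIELDS_PER_TOKEN      # P2_* starts at 0, P1_* at 7, W_* at 14, ...
        context[base:base + FIELDS_PER_TOKEN] = [
            t.sic, t.cluster, t.shape, t.prefix, t.suffix, t.pos, t.sense,
        ]
    return context

# e.g. fill_pos_context(0, [Token(sic=11, pos=3), Token(sic=12)])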
spacy/en.pyx | 51

@@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
# TODO
#The script translate_treebank_tokenization can be used to transform a treebank's
#annotation to use one of the spacy tokenization schemes.


from __future__ import unicode_literals

cimport lang

@@ -42,6 +37,32 @@ from .typedefs cimport flags_t
import orth


POS_TEMPLATES = (
    (W_sic,),
    (P1_sic,),
    (N1_sic,),
    (N2_sic,),
    (P2_sic,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
)


cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.

@@ -49,6 +70,9 @@ cdef class English(Language):
    name (unicode): The two letter code used by Wikipedia for the language.
    lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA

@@ -64,5 +88,22 @@ cdef class English(Language):
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context)

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
            c += tokens.data[i].pos == golds[i]
        return c


EN = English('en')
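For orientation, a small pure-Python sketch of what templates like POS_TEMPLATES do once the context array is filled: each template names a few positions in the array, and the values found at those positions become one categorical feature. The index constants below follow the enum layout above, but the helper itself is an assumption for illustration; in spaCy this step is handled in C by thinc's Extractor.

# Positions assume 7 fields per token: P2_* at 0-6, P1_* at 7-13, W_* at 14-20, ...
P2_pos, P1_pos, W_sic, W_suffix = 5, 12, 14, 18

TEMPLATES = (
    (W_sic,),
    (W_suffix,),
    (P1_pos,),
    (P1_pos, P2_pos),
)

def extract_features(context, templates=TEMPLATES):
    """One feature per template: (template id, values at the template's slots)."""
    feats = []
    for templ_id, templ in enumerate(templates):
        values = tuple(context[idx] for idx in templ)
        if any(values):                 # skip features that are all padding zeros
            feats.append((templ_id, values))
    return feats

# e.g. extract_features([0] * 35) == []   (an all-zero context yields no features)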
@@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr


cdef class Lexicon:
    cpdef public set_flags
    cpdef public get_lex_props
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings


@@ -37,7 +37,7 @@ cdef class Language:
        self._prefix_re = re.compile(prefix)
        self._suffix_re = re.compile(suffix)
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.set_flags)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self.pos_tagger = None


@@ -249,13 +249,13 @@ cdef class Lexicon:

    Also interns UTF-8 strings, and maps them to consecutive integer IDs.
    '''
    def __init__(self, object set_flags=None):
    def __init__(self, object get_props):
        self.mem = Pool()
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.size = 2
        self.set_flags = set_flags
        self.get_lex_props = get_props

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme

@@ -267,9 +267,10 @@ cdef class Lexicon:
            return lex
        if string.n < 3:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
        while self.lexemes.size() < (lex.id + 1):
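The change running through these hunks is that the Lexicon no longer takes a flag-setter; it takes a single get_props callable that returns all lexical properties as one dict (see English.get_props above, which returns 'flags' and 'dense'). A rough pure-Python sketch of that contract, with the names and the shape logic as illustrative assumptions rather than spaCy's actual implementation:

def get_props(string):
    """Return every string-derived property in one dict, as English.get_props does."""
    shape = ''.join('X' if c.isupper() else 'x' if c.isalpha()
                    else 'd' if c.isdigit() else c for c in string)
    flags = 0
    flags |= int(string.isalpha()) << 0     # stand-in for IS_ALPHA
    flags |= int(string.isdigit()) << 1     # stand-in for IS_DIGIT
    return {'flags': flags, 'dense': shape}

class Lexicon:
    """Caches one property dict per string; the property getter is injected."""
    def __init__(self, get_lex_props):
        self.get_lex_props = get_lex_props
        self._entries = {}

    def get(self, string):
        if string not in self._entries:
            self._entries[string] = self.get_lex_props(string)
        return self._entries[string]

# Lexicon(get_props).get("Apple") -> {'flags': 1, 'dense': 'Xxxxx'}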
@@ -72,17 +72,14 @@ cpdef enum attr_id_t:

    ID
    SIC
    STEM
    DENSE
    SHAPE
    ASCIIED
    PREFIX
    SUFFIX

    LENGTH
    CLUSTER
    POS_TYPE
    SENSE_TYPE


cdef struct Lexeme:

@@ -90,20 +87,16 @@ cdef struct Lexeme:

    attr_t id
    attr_t sic
    attr_t stem
    attr_t dense
    attr_t shape
    attr_t asciied
    attr_t prefix
    attr_t suffix

    attr_t length
    attr_t cluster
    attr_t pos_type
    attr_t sense_type

    float prob
    float lower_pc
    float sentiment


@@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.sic
    elif feat_name == DENSE:
        return lex.dense
    elif feat_name == STEM:
        return lex.stem
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == ASCIIED:
        return lex.asciied
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:

@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    elif feat_name == SENSE_TYPE:
        return lex.sense_type
    else:
        return 0
@@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,

    lex.cluster = props.get('cluster', 0)
    lex.pos_type = props.get('pos_type', 0)
    lex.sense_type = props.get('sense_type', 0)
    lex.prob = props.get('prob', 0)

    lex.lower_pc = props.get('lower_pc', 0.0)

    lex.prefix = string_store[string[:1]]
    lex.suffix = string_store[string[-3:]]
    lex.shape = string_store[orth.word_shape(string)]
    lex.dense = lex.sic if lex.prob >= -10 else lex.shape
    lex.stem = string_store[props.get('stem', string)]
    lex.asciied = string_store[orth.asciied(string)]
    lex.dense = string_store[props['dense']]

    lex.flags = props.get('flags', 0)
    return lex
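The string_store[...] lookups above intern each derived string (first character, last three characters, shape, dense form) and hand back an integer id that is stored on the Lexeme. A toy pure-Python analogue of that interning behaviour; the id scheme and the reserved 0 slot are assumptions for illustration, not spaCy's exact StringStore:

class StringStore:
    """Toy string interner: each distinct string gets a consecutive integer id."""
    def __init__(self):
        self._ids = {}
        self._strings = []

    def __getitem__(self, string):
        if string not in self._ids:
            self._ids[string] = len(self._strings) + 1   # 0 reserved for "missing"
            self._strings.append(string)
        return self._ids[string]

store = StringStore()
assert store["Apple"[:1]] == 1      # prefix 'A' interned first
assert store["Apple"[-3:]] == 2     # suffix 'ple' interned next
assert store["Apple"[:1]] == 1      # repeated lookups return the same id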
@@ -3,25 +3,17 @@ from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMap

from .typedefs cimport hash_t
from .tokens cimport Tokens


cpdef enum TagType:
    POS
    SENSE


cdef class Tagger:
    cpdef int set_tags(self, Tokens tokens) except -1
    cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except *
    cdef class_t predict(self, atom_t* context, object golds=*) except *

    cpdef readonly Pool mem
    cpdef readonly Extractor extractor
    cpdef readonly LinearModel model

    cpdef readonly TagType tag_type
    cpdef readonly list tag_names
    cdef dict tagdict
@@ -2,9 +2,6 @@
from __future__ import unicode_literals
from __future__ import division

from .context cimport fill_context
from .context cimport N_FIELDS

from os import path
import os
import shutil

@@ -15,12 +12,11 @@ import cython
from thinc.features cimport Feature, count_feats


def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
def setup_model_dir(tag_names, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'tag_type': tag_type,
        'templates': templates,
        'tag_names': tag_names,
        'tag_counts': tag_counts,

@@ -29,29 +25,6 @@ def setup_model_dir(tag_names, tag_counts, templates, model_dir):
        json.dump(config, file_)


def train(train_sents, model_dir, nr_iter=10):
    cdef Tokens tokens
    cdef Tagger tagger = Tagger(model_dir)
    cdef int i
    cdef class_t guess = 0
    cdef class_t gold
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for tokens, golds in train_sents:
            assert len(tokens) == len(golds), [t.string for t in tokens]
            for i in range(tokens.length):
                gold = golds[i]
                guess = tagger.predict(i, tokens, [gold])
                tokens.set_tag(i, tagger.tag_type, guess)
                total += 1
                n_corr += guess == gold
        print('%.4f' % ((n_corr / total) * 100))
        random.shuffle(train_sents)
    tagger.model.end_training()
    tagger.model.dump(path.join(model_dir, 'model'))


cdef class Tagger:
    """Assign part-of-speech, named entity or supersense tags, using greedy
    decoding. The tagger reads its model and configuration from disk.

@@ -61,26 +34,13 @@ cdef class Tagger:
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        self.tag_names = cfg['tag_names']
        self.tag_type = cfg['tag_type']
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

    cpdef int set_tags(self, Tokens tokens) except -1:
        """Assign tags to a Tokens object.

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> assert tokens[0].pos == 'NO_TAG'
        >>> EN.pos_tagger.set_tags(tokens)
        >>> assert tokens[0].pos == 'DT'
        """
        cdef int i
        for i in range(tokens.length):
            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))

    cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *:
    cdef class_t predict(self, atom_t* context, object golds=None) except *:
        """Predict the tag of tokens[i]. The tagger remembers the features and
        prediction, in case you later call tell_answer.

@@ -88,11 +48,6 @@ cdef class Tagger:
        >>> tag = EN.pos_tagger.predict(0, tokens)
        >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
        """
        cdef atom_t sic = tokens.data[i].lex.sic
        if sic in self.tagdict:
            return self.tagdict[sic]
        cdef atom_t[N_FIELDS] context
        fill_context(context, i, tokens.data)
        cdef int n_feats
        cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
        cdef weight_t* scores = self.model.get_scores(feats, n_feats)
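After this change the Tagger no longer builds its own context: the caller (English.set_pos or train_pos above) fills the context array, and predict only extracts features, scores each tag, and, at training time, updates toward the gold tag. A minimal pure-Python sketch of that score-and-update loop, assuming a simple perceptron-style update rather than thinc's actual LinearModel internals:

from collections import defaultdict

class ToyLinearTagger:
    def __init__(self, n_tags):
        self.n_tags = n_tags
        self.weights = defaultdict(lambda: [0.0] * n_tags)   # feature -> per-tag weights

    def predict(self, feats, gold=None):
        """Greedy argmax over tag scores; optional perceptron update against gold."""
        scores = [0.0] * self.n_tags
        for f in feats:
            for tag, w in enumerate(self.weights[f]):
                scores[tag] += w
        guess = max(range(self.n_tags), key=scores.__getitem__)
        if gold is not None and guess != gold:
            for f in feats:
                self.weights[f][gold] += 1.0
                self.weights[f][guess] -= 1.0
        return guess

# tagger = ToyLinearTagger(n_tags=3)
# tagger.predict([(0, ('the',)), (2, ('DT',))], gold=1)   # returns a tag id, updates weights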
@@ -140,11 +140,11 @@ cdef class Token:
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.postype = lex['pos_type']
        self.sensetype = lex['sense_type']
        self.sensetype = 0
        self.sic = lex['sic']
        self.norm = lex['dense']
        self.shape = lex['shape']
        self.suffix = lex['asciied']
        self.suffix = lex['suffix']
        self.prefix = lex['prefix']

        self.prob = lex['prob']