Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)
Commit ef4398b204 (parent 327383e38a)

    Rearrange POS stuff, so that language-specific stuff can live in
    language-specific modules

setup.py (1 line changed)
@@ -55,7 +55,6 @@ exts = [
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
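The spacy.context extension can go because the module itself is emptied below; its helpers come back as cdef inline functions in spacy/en.pxd, which compile into whichever module cimports them and so need no Extension entry of their own. For orientation, a minimal sketch of how such an entry is driven, assuming the distutils/Cython.Distutils boilerplate of this era (not part of this commit):

    # Hypothetical, stripped-down setup.py in the style of the list above.
    from distutils.core import setup
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    includes = ['.']  # assumption: the real list carries the C header paths

    exts = [
        Extension("spacy.tagger", ["spacy/tagger.pyx"],
                  language="c++", include_dirs=includes),
    ]

    setup(name='spacy',
          cmdclass={'build_ext': build_ext},  # compile each .pyx to C++ first
          ext_modules=exts)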
spacy/context.pxd (64 lines removed)

@@ -1,64 +1 @@
-from thinc.typedefs cimport atom_t
-
-from .tokens cimport TokenC
-
-
-cpdef enum:
-    P2_sic
-    P2_cluster
-    P2_shape
-    P2_prefix
-    P2_suffix
-    P2_pos
-    P2_sense
-
-    P1_sic
-    P1_cluster
-    P1_shape
-    P1_prefix
-    P1_suffix
-    P1_pos
-    P1_sense
-
-    W_sic
-    W_cluster
-    W_shape
-    W_prefix
-    W_suffix
-    W_pos
-    W_sense
-
-    N1_sic
-    N1_cluster
-    N1_shape
-    N1_prefix
-    N1_suffix
-    N1_pos
-    N1_sense
-
-    N2_sic
-    N2_cluster
-    N2_shape
-    N2_prefix
-    N2_suffix
-    N2_pos
-    N2_sense
-
-    N_FIELDS
-
-
-cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
-
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.pos
-    context[6] = t.sense
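Everything removed here reappears below in spacy/en.pxd, with N_FIELDS renamed to N_CONTEXT_FIELDS and fill_context renamed to fill_pos_context. That relocation is the point of the commit: the feature-window layout is language-specific, so it moves into the English module.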
spacy/en.pxd (70 lines changed)

@@ -1,5 +1,9 @@
-from spacy.lang cimport Language
-from spacy.tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+from .lang cimport Language
+from .tokens cimport Tokens
+from .tokens cimport TokenC


 # Flags
 cpdef enum FlagID:
@@ -28,5 +32,67 @@ cpdef enum FlagID:
     IN_NAMES


+cpdef enum:
+    P2_sic
+    P2_cluster
+    P2_shape
+    P2_prefix
+    P2_suffix
+    P2_pos
+    P2_sense
+
+    P1_sic
+    P1_cluster
+    P1_shape
+    P1_prefix
+    P1_suffix
+    P1_pos
+    P1_sense
+
+    W_sic
+    W_cluster
+    W_shape
+    W_prefix
+    W_suffix
+    W_pos
+    W_sense
+
+    N1_sic
+    N1_cluster
+    N1_shape
+    N1_prefix
+    N1_suffix
+    N1_pos
+    N1_sense
+
+    N2_sic
+    N2_cluster
+    N2_shape
+    N2_prefix
+    N2_suffix
+    N2_pos
+    N2_sense
+
+    N_CONTEXT_FIELDS
+
+
+cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
+    _fill_from_token(&context[P1_sic], &tokens[i-1])
+    _fill_from_token(&context[W_sic], &tokens[i])
+    _fill_from_token(&context[N1_sic], &tokens[i+1])
+    _fill_from_token(&context[N2_sic], &tokens[i+2])
+
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.sic
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.pos
+    context[6] = t.sense
+
+
 cdef class English(Language):
     pass
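The enum ordering does real work here: each of the five window positions owns seven consecutive slots, so &context[P2_sic] hands _fill_from_token the start of the P2 block, and N_CONTEXT_FIELDS, as the member after the last attribute, is exactly the buffer size callers must allocate. A rough Python rendering of that indexing, illustrative only (the real code is the nogil Cython above, which relies on the TokenC array being padded so that i-2 and i+2 stay in bounds):

    ATTRS = ('sic', 'cluster', 'shape', 'prefix', 'suffix', 'pos', 'sense')
    OFFSETS = (-2, -1, 0, 1, 2)  # P2, P1, W, N1, N2

    def fill_pos_context(tokens, i):
        context = []
        for off in OFFSETS:
            tok = tokens[i + off]            # assumes a padded token sequence
            context.extend(tok[attr] for attr in ATTRS)
        return context                       # 5 * 7 == 35 == N_CONTEXT_FIELDS

    # e.g. the P1_pos slot sits at flat index 7 * 1 + 5 == 12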
spacy/en.pyx (51 lines changed)
@@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
-# TODO
-#The script translate_treebank_tokenization can be used to transform a treebank's
-#annotation to use one of the spacy tokenization schemes.
-
-
 from __future__ import unicode_literals

 cimport lang
@@ -42,6 +37,32 @@ from .typedefs cimport flags_t
 import orth


+POS_TEMPLATES = (
+    (W_sic,),
+    (P1_sic,),
+    (N1_sic,),
+    (N2_sic,),
+    (P2_sic,),
+
+    (W_suffix,),
+    (W_prefix,),
+
+    (P1_pos,),
+    (P2_pos,),
+    (P1_pos, P2_pos),
+    (P1_pos, W_sic),
+    (P1_suffix,),
+    (N1_suffix,),
+
+    (W_shape,),
+    (W_cluster,),
+    (N1_cluster,),
+    (N2_cluster,),
+    (P1_cluster,),
+    (P2_cluster,),
+)
+
+
 cdef class English(Language):
     """English tokenizer, tightly coupled to lexicon.

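Each template is a tuple of indices into the context array that fill_pos_context populates; thinc's Extractor turns the values a template selects into one feature, so unigram templates like (W_sic,) and conjunctions like (P1_pos, P2_pos) live in the same table. A hedged sketch of the mechanism (the real extraction happens in C inside thinc; names below are illustrative):

    def extract_features(context, templates):
        """Toy version of what thinc's Extractor does with POS_TEMPLATES."""
        feats = []
        for templ_id, templ in enumerate(templates):
            values = tuple(context[idx] for idx in templ)
            # each (template id, selected values) pair becomes one feature,
            # which the learner then maps to a row of weights
            feats.append((templ_id,) + values)
        return feats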
@@ -49,6 +70,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    def get_props(self, unicode string):
+        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
+
     def set_flags(self, unicode string):
         cdef flags_t flags = 0
         flags |= orth.is_alpha(string) << IS_ALPHA
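get_props is now the single hook through which language-specific lexical properties flow: the Lexicon changes below call it for every fresh Lexeme, and lexeme_init reads the 'dense' key directly. A hypothetical call, using the EN instance defined at the bottom of this file:

    props = EN.get_props(u'example')
    assert set(props) == {'flags', 'dense'}   # flag bit-field + word-shape string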
@@ -64,5 +88,22 @@ cdef class English(Language):
         flags |= orth.like_number(string) << LIKE_NUMBER
         return flags

+    def set_pos(self, Tokens tokens):
+        cdef int i
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        for i in range(tokens.length):
+            fill_pos_context(context, i, tokens.data)
+            tokens.data[i].pos = self.pos_tagger.predict(context)
+
+    def train_pos(self, Tokens tokens, golds):
+        cdef int i
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        c = 0
+        for i in range(tokens.length):
+            fill_pos_context(context, i, tokens.data)
+            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
+            c += tokens.data[i].pos == golds[i]
+        return c
+

 EN = English('en')
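train_pos returns the number of correct guesses for one document, so a driver loop only has to aggregate and report. A hypothetical sketch of such a loop, modelled on the train() helper this commit deletes from spacy/tagger.pyx further down (train_sents and nr_iter are illustrative names, not part of the diff):

    import random

    def train(train_sents, nr_iter=10):
        for _ in range(nr_iter):
            n_corr = 0
            total = 0
            for tokens, golds in train_sents:
                n_corr += EN.train_pos(tokens, golds)  # predict + perceptron update
                total += len(golds)
            print('%.4f' % (n_corr / float(total) * 100))  # per-iteration accuracy
            random.shuffle(train_sents)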
spacy/lang.pxd

@@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr


 cdef class Lexicon:
-    cpdef public set_flags
+    cpdef public get_lex_props
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
spacy/lang.pyx

@@ -37,7 +37,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(self.set_flags)
+        self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
         self.pos_tagger = None
@@ -249,13 +249,13 @@ cdef class Lexicon:

     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object set_flags=None):
+    def __init__(self, object get_props):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 2
-        self.set_flags = set_flags
+        self.get_lex_props = get_props

     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
         '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
@@ -267,9 +267,10 @@ cdef class Lexicon:
             return lex
         if string.n < 3:
             mem = self.mem
+        cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
-                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
+        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+                             self.get_lex_props(py_string))
         if mem is self.mem:
             self._map.set(string.key, lex)
             while self.lexemes.size() < (lex.id + 1):
spacy/lexeme.pxd

@@ -72,17 +72,14 @@ cpdef enum attr_id_t:

     ID
     SIC
-    STEM
     DENSE
     SHAPE
-    ASCIIED
     PREFIX
     SUFFIX

     LENGTH
     CLUSTER
     POS_TYPE
-    SENSE_TYPE


 cdef struct Lexeme:
@@ -90,20 +87,16 @@ cdef struct Lexeme:

     attr_t id
     attr_t sic
-    attr_t stem
     attr_t dense
     attr_t shape
-    attr_t asciied
     attr_t prefix
     attr_t suffix

     attr_t length
     attr_t cluster
     attr_t pos_type
-    attr_t sense_type

     float prob
-    float lower_pc
     float sentiment

@@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
         return lex.sic
     elif feat_name == DENSE:
         return lex.dense
-    elif feat_name == STEM:
-        return lex.stem
     elif feat_name == SHAPE:
         return lex.shape
-    elif feat_name == ASCIIED:
-        return lex.asciied
     elif feat_name == PREFIX:
         return lex.prefix
     elif feat_name == SUFFIX:
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
         return lex.cluster
     elif feat_name == POS_TYPE:
         return lex.pos_type
-    elif feat_name == SENSE_TYPE:
-        return lex.sense_type
     else:
         return 0
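get_attr stays a plain elif chain because it must run nogil; anything not matched falls through to 0 rather than raising. A Python emulation of the trimmed dispatch, for orientation only (a dict stands in for the C Lexeme struct):

    def get_attr(lex, feat_name):
        table = {'ID': 'id', 'SIC': 'sic', 'DENSE': 'dense', 'SHAPE': 'shape',
                 'PREFIX': 'prefix', 'SUFFIX': 'suffix', 'LENGTH': 'length',
                 'CLUSTER': 'cluster', 'POS_TYPE': 'pos_type'}
        return lex[table[feat_name]] if feat_name in table else 0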
spacy/lexeme.pyx

@@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,

     lex.cluster = props.get('cluster', 0)
     lex.pos_type = props.get('pos_type', 0)
-    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)

-    lex.lower_pc = props.get('lower_pc', 0.0)
-
     lex.prefix = string_store[string[:1]]
     lex.suffix = string_store[string[-3:]]
     lex.shape = string_store[orth.word_shape(string)]
-    lex.dense = lex.sic if lex.prob >= -10 else lex.shape
-    lex.stem = string_store[props.get('stem', string)]
-    lex.asciied = string_store[orth.asciied(string)]
+    lex.dense = string_store[props['dense']]

     lex.flags = props.get('flags', 0)
     return lex
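Note the asymmetry: most properties keep a .get() default, but 'dense' is looked up with props['dense'], making it a required key of the dict the get_props callback supplies; English.get_props above always provides it via orth.word_shape.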
spacy/tagger.pxd

@@ -3,25 +3,17 @@ from cymem.cymem cimport Pool
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
-from preshed.maps cimport PreshMap

 from .typedefs cimport hash_t
 from .tokens cimport Tokens


-cpdef enum TagType:
-    POS
-    SENSE
-
-
 cdef class Tagger:
-    cpdef int set_tags(self, Tokens tokens) except -1
-    cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except *
+    cdef class_t predict(self, atom_t* context, object golds=*) except *

     cpdef readonly Pool mem
     cpdef readonly Extractor extractor
     cpdef readonly LinearModel model

-    cpdef readonly TagType tag_type
     cpdef readonly list tag_names
     cdef dict tagdict
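One Cython detail in the new declaration: .pxd files may not spell out default values, so golds=* only records that the argument has one; the concrete default (golds=None) lives in the matching definition in spacy/tagger.pyx below.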
spacy/tagger.pyx

@@ -2,9 +2,6 @@
 from __future__ import unicode_literals
 from __future__ import division

-from .context cimport fill_context
-from .context cimport N_FIELDS
-
 from os import path
 import os
 import shutil
@@ -15,12 +12,11 @@ import cython
 from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_counts, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
-        'tag_type': tag_type,
         'templates': templates,
         'tag_names': tag_names,
         'tag_counts': tag_counts,
@@ -29,29 +25,6 @@ def setup_model_dir(tag_names, tag_counts, templates, model_dir):
         json.dump(config, file_)


-def train(train_sents, model_dir, nr_iter=10):
-    cdef Tokens tokens
-    cdef Tagger tagger = Tagger(model_dir)
-    cdef int i
-    cdef class_t guess = 0
-    cdef class_t gold
-    for _ in range(nr_iter):
-        n_corr = 0
-        total = 0
-        for tokens, golds in train_sents:
-            assert len(tokens) == len(golds), [t.string for t in tokens]
-            for i in range(tokens.length):
-                gold = golds[i]
-                guess = tagger.predict(i, tokens, [gold])
-                tokens.set_tag(i, tagger.tag_type, guess)
-                total += 1
-                n_corr += guess == gold
-            print('%.4f' % ((n_corr / total) * 100))
-        random.shuffle(train_sents)
-    tagger.model.end_training()
-    tagger.model.dump(path.join(model_dir, 'model'))
-
-
 cdef class Tagger:
     """Assign part-of-speech, named entity or supersense tags, using greedy
     decoding. The tagger reads its model and configuration from disk.
@@ -61,26 +34,13 @@ cdef class Tagger:
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
         self.tag_names = cfg['tag_names']
-        self.tag_type = cfg['tag_type']
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))

-    cpdef int set_tags(self, Tokens tokens) except -1:
-        """Assign tags to a Tokens object.
-
-        >>> tokens = EN.tokenize(u'An example sentence.')
-        >>> assert tokens[0].pos == 'NO_TAG'
-        >>> EN.pos_tagger.set_tags(tokens)
-        >>> assert tokens[0].pos == 'DT'
-        """
-        cdef int i
-        for i in range(tokens.length):
-            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
-
-    cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *:
+    cdef class_t predict(self, atom_t* context, object golds=None) except *:
         """Predict the tag of tokens[i]. The tagger remembers the features and
         prediction, in case you later call tell_answer.
@@ -88,11 +48,6 @@ cdef class Tagger:
         >>> tag = EN.pos_tagger.predict(0, tokens)
         >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
         """
-        cdef atom_t sic = tokens.data[i].lex.sic
-        if sic in self.tagdict:
-            return self.tagdict[sic]
-        cdef atom_t[N_FIELDS] context
-        fill_context(context, i, tokens.data)
         cdef int n_feats
         cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
         cdef weight_t* scores = self.model.get_scores(feats, n_feats)
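Taken together, the tagger.pyx changes strip predict down to a pure model call: it no longer touches Tokens, the tagdict frequent-word shortcut, or the context buffer. Context construction is now the caller's job (English.set_pos and train_pos in en.pyx above), which is precisely what lets the feature layout migrate into the language-specific spacy/en.pxd.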
spacy/tokens.pyx

@@ -140,11 +140,11 @@ cdef class Token:
         self.cluster = lex['cluster']
         self.length = lex['length']
         self.postype = lex['pos_type']
-        self.sensetype = lex['sense_type']
+        self.sensetype = 0
         self.sic = lex['sic']
         self.norm = lex['dense']
         self.shape = lex['shape']
-        self.suffix = lex['asciied']
+        self.suffix = lex['suffix']
         self.prefix = lex['prefix']

         self.prob = lex['prob']