mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Generalize tagger code, in preparation for NER and supersense tagging.
This commit is contained in:
parent
81da61f3cf
commit
3733444101
|
@ -6,7 +6,7 @@ from cymem.cymem cimport Pool
|
|||
from .typedefs cimport hash_t
|
||||
from .tokens cimport Tokens
|
||||
from .lexeme cimport Lexeme
|
||||
from .pos cimport Tagger as PosTagger
|
||||
from .tagger cimport Tagger
|
||||
from .utf8string cimport StringStore
|
||||
|
||||
|
||||
|
@ -41,14 +41,13 @@ cdef class Language:
|
|||
cdef PreshMap _specials
|
||||
cpdef readonly Lexicon lexicon
|
||||
|
||||
cpdef readonly PosTagger pos_tagger
|
||||
cpdef readonly Tagger pos_tagger
|
||||
|
||||
cdef object _prefix_re
|
||||
cdef object _suffix_re
|
||||
cdef object _infix_re
|
||||
|
||||
cpdef Tokens tokenize(self, unicode text)
|
||||
cpdef Tokens pos_tag(self, Tokens t)
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
|
||||
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
|
||||
|
|
|
@ -23,7 +23,7 @@ from . import util
|
|||
from .util import read_lang_data
|
||||
from .tokens import Tokens
|
||||
|
||||
from .pos cimport Tagger as PosTagger
|
||||
from .tagger cimport Tagger
|
||||
|
||||
|
||||
cdef class Language:
|
||||
|
@ -42,7 +42,7 @@ cdef class Language:
|
|||
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
|
||||
self._load_special_tokenization(rules)
|
||||
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
|
||||
self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
|
||||
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
|
||||
else:
|
||||
self.pos_tagger = None
|
||||
|
||||
|
@ -93,16 +93,6 @@ cdef class Language:
|
|||
self._tokenize(tokens, &span, start, i)
|
||||
return tokens
|
||||
|
||||
cpdef Tokens pos_tag(self, Tokens t):
|
||||
if self.pos_tagger is None:
|
||||
return t
|
||||
cdef int i
|
||||
t.pos[-1] = self.pos_tagger.encode_pos('EOL')
|
||||
t.pos[-2] = self.pos_tagger.encode_pos('EOL')
|
||||
for i in range(t.length):
|
||||
t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
|
||||
return t
|
||||
|
||||
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
|
||||
cdef vector[Lexeme*] prefixes
|
||||
cdef vector[Lexeme*] suffixes
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from thinc.learner cimport LinearModel
|
||||
from thinc.features cimport Extractor
|
||||
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
||||
|
||||
from .tokens cimport Tokens
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
cpdef readonly Extractor extractor
|
||||
cpdef readonly LinearModel model
|
||||
|
||||
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0
|
||||
cpdef bint tell_answer(self, class_t gold_tag) except *
|
||||
|
||||
cdef Pool mem
|
||||
cdef class_t _guess
|
||||
cdef atom_t* _atoms
|
||||
cdef feat_t* _feats
|
||||
cdef weight_t* _values
|
||||
cdef weight_t* _scores
|
|
@ -30,7 +30,7 @@ cdef class Tagger:
|
|||
if path.exists(tags_loc):
|
||||
with open(tags_loc) as file_:
|
||||
Tagger.tags.update(ujson.load(file_))
|
||||
self.model = LinearModel(len(self.tags), self.extractor.n)
|
||||
self.model = LinearModel(len(self.tags))
|
||||
if path.exists(path.join(model_dir, 'model')):
|
||||
self.model.load(path.join(model_dir, 'model'))
|
||||
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
|
||||
|
|
83
spacy/pos_feats.pxd
Normal file
83
spacy/pos_feats.pxd
Normal file
|
@ -0,0 +1,83 @@
|
|||
from .tokens cimport Tokens
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
|
||||
# Indices into the atom_t context array that fill_context() populates.
#
# Each of the five token slots (P2, P1, N0, N1, N2) takes 13 consecutive
# entries, laid out in the same order in which _fill_token() in pos_feats.pyx
# writes atoms[0]..atoms[12].  Member order is therefore significant:
# inserting or reordering a name shifts every index that follows it.
cpdef enum:
    # Slot filled from tokens.lex[i-2].
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2title
    P2upper
    P2oft_title
    P2oft_upper
    P2pos
    P2url
    P2num

    # Slot filled from tokens.lex[i-1].
    P1i
    P1c
    P1w
    P1shape
    # NOTE(review): spelled "P1pre", while every other slot uses "...pref".
    # Confirm nothing depends on this name before making it consistent.
    P1pre
    P1suff
    P1title
    P1upper
    P1oft_title
    P1oft_upper
    P1pos
    P1url
    P1num

    # Slot filled from tokens.lex[i].
    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0title
    N0upper
    N0oft_title
    N0oft_upper
    N0pos
    N0url
    N0num

    # Slot filled from tokens.lex[i+1].
    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1title
    N1upper
    N1oft_title
    N1oft_upper
    N1pos
    N1url
    N1num

    # Slot filled from tokens.lex[i+2].
    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2title
    N2upper
    N2oft_title
    N2oft_upper
    N2pos
    N2url
    N2num

    # Tags read from tokens.pos[i-2] and tokens.pos[i-1].
    P2t
    P1t

    # Must stay last: its value is the number of entries above, and the
    # tagger uses it to size the context buffer.
    CONTEXT_SIZE
|
||||
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
|
77
spacy/pos_feats.pyx
Normal file
77
spacy/pos_feats.pyx
Normal file
|
@ -0,0 +1,77 @@
|
|||
from .lexeme cimport *
|
||||
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
|
||||
# Feature templates for the part-of-speech tagger.  Each tuple lists the
# context slots (enum members declared in pos_feats.pxd) that make up one
# feature; the tagger hands the whole sequence to thinc's Extractor.
TEMPLATES = (
    # Current word, its affixes, previous tags and neighbouring words.
    (N0i,),
    (N0w,),
    (N0suff,),
    (N0pref,),
    (P1t,),
    (P2t,),
    (P1t, P2t),
    (P1t, N0w),
    (P1w,),
    (P1suff,),
    (P2w,),
    (N1w,),
    (N1suff,),
    (N2w,),

    # Shape, cluster and casing-frequency features.
    (N0shape,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c,),
    (P2c,),
    (P1c, N0c),
    (N0c, N1c),
    (P1c, P1t),
    (P1c, P1t, N0c),
    (P1t, N0c),
    (N0oft_upper,),
    (N0oft_title,),

    # Word bigrams.
    (P1w, N0w),
    (N0w, N1w),

    # Lexicon POS-type features.
    (N0pos,),
    (P1t, N0pos, N1pos),
    (P1t, N1pos),

    # URL-like / number-like flags for the previous, current and next token.
    (N0url,),
    (N0num,),
    (P1url,),
    # This entry was a second, redundant (P1url,).  The N0 and N1 groups each
    # pair a url template with a num template, so the duplicate was almost
    # certainly meant to be P1num.  Models trained with the old list must be
    # retrained, since the feature set changes.
    (P1num,),
    (N1num,),
    (N1url,),
)
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
    """Populate `context` with the features for tagging token `i`.

    Writes the two previously assigned tags and the lexical attributes of
    the five-token window i-2 .. i+2 into the slots named by the enum in
    pos_feats.pxd.  `context` must have room for CONTEXT_SIZE entries.

    NOTE(review): the window indexes tokens.lex / tokens.pos outside
    [0, length) near sentence edges; this presumably relies on the padding
    that Tokens allocates around its arrays -- confirm.
    """
    # Tags already assigned to the two preceding tokens.
    context[P2t] = tokens.pos[i-2]
    context[P1t] = tokens.pos[i-1]

    # Lexeme attributes, one 13-entry slot per token in the window.
    _fill_token(&context[N0i], tokens.lex[i])
    _fill_token(&context[P1i], tokens.lex[i-1])
    _fill_token(&context[P2i], tokens.lex[i-2])
    _fill_token(&context[N1i], tokens.lex[i+1])
    _fill_token(&context[N2i], tokens.lex[i+2])
|
||||
|
||||
|
||||
# Write the 13 per-token features for `lex` into atoms[0]..atoms[12].
# The offsets follow the per-slot member order of the enum in pos_feats.pxd
# (i, c, w, shape, pref, suff, title, upper, oft_title, oft_upper, pos, url,
# num), so the two must be kept in step.
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    atoms[0] = lex.sic
    atoms[1] = lex.cluster

    # Word feature: use the normalised form only when prob is non-zero and at
    # least -10; otherwise fall back to the word shape.
    # NOTE(review): prob is presumably a log probability, with 0 meaning
    # "unknown" -- confirm.
    if lex.prob != 0 and lex.prob >= -10:
        atoms[2] = lex.norm
    else:
        atoms[2] = lex.shape

    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix

    # Flag features keep the raw masked bit (non-zero when set), not 0/1.
    atoms[6] = lex.flags & (1 << IS_TITLE)
    atoms[7] = lex.flags & (1 << IS_UPPER)
    atoms[8] = lex.flags & (1 << OFT_TITLE)
    atoms[9] = lex.flags & (1 << OFT_UPPER)
    atoms[10] = lex.postype
    atoms[11] = lex.flags & (1 << LIKE_URL)
    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
|
||||
|
|
@ -6,9 +6,10 @@ from .en import EN
|
|||
from .pos import Tagger
|
||||
|
||||
|
||||
def read_gold(file_):
|
||||
def read_gold(file_, tag_list):
|
||||
paras = file_.read().strip().split('\n\n')
|
||||
golds = []
|
||||
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
|
||||
for para in paras:
|
||||
if not para.strip():
|
||||
continue
|
||||
|
@ -32,10 +33,16 @@ def read_gold(file_):
|
|||
else:
|
||||
conll_toks.pop(0)
|
||||
assert len(tags) == len(tokens)
|
||||
tags = [Tagger.encode_pos(t) for t in tags]
|
||||
tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
|
||||
golds.append((tokens, tags))
|
||||
return golds
|
||||
|
||||
def _encode_pos(tag, tag_ids, tag_list):
|
||||
if tag not in tag_ids:
|
||||
tag_ids[tag] = len(tag_list)
|
||||
tag_list.append(tag)
|
||||
return tag_ids[tag]
|
||||
|
||||
|
||||
def ptb_to_univ(tag):
|
||||
mapping = dict(tuple(line.split()) for line in """
|
||||
|
|
|
@ -7,7 +7,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
|
|||
from .tokens cimport Tokens
|
||||
|
||||
|
||||
cdef enum TagType:
|
||||
cpdef enum TagType:
|
||||
POS
|
||||
ENTITY
|
||||
SENSE
|
||||
|
|
|
@ -1,37 +1,93 @@
|
|||
# cython: profile=True
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import division
|
||||
|
||||
from os import path
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import codecs
|
||||
import gzip
|
||||
import json
|
||||
import cython
|
||||
|
||||
from .pos_feats cimport fill_context as pos_fill_context
|
||||
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
|
||||
|
||||
from thinc.features cimport ConjFeat
|
||||
|
||||
|
||||
NULL_TAG = 0
|
||||
|
||||
|
||||
def setup_model_dir(tag_type, tag_names, templates, model_dir):
    """Create a fresh model directory holding the tagger's `config.json`.

    Any existing directory at `model_dir` is deleted, together with its
    contents, before the new one is created.  The written config has the
    keys 'tag_type', 'templates' and 'tag_names', which Tagger.__init__
    reads back.

    All three values must be JSON-serialisable; tuples in `templates` come
    back as lists when the config is reloaded.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    # makedirs rather than mkdir, so a missing parent directory is created
    # instead of raising OSError.
    os.makedirs(model_dir)
    config = {
        'tag_type': tag_type,
        'templates': templates,
        'tag_names': tag_names,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)
|
||||
|
||||
|
||||
def train(train_sents, model_dir, nr_iter=5):
    """Train a tagger on `train_sents` and save the model under `model_dir`.

    `model_dir` must already contain the config.json that Tagger reads.
    `train_sents` is a list of (tokens, golds) pairs with one gold tag ID per
    token.  The list is shuffled in place after every iteration.

    Each prediction is written back into `tokens` before the model is told
    the gold tag.  Tokens whose gold tag is NULL_TAG still update the model
    but are left out of the accuracy, which is printed as a percentage once
    per iteration.
    """
    tagger = Tagger(model_dir)
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for tokens, golds in train_sents:
            assert len(tokens) == len(golds), [t.string for t in tokens]
            for i, gold in enumerate(golds):
                guess = tagger.predict(i, tokens)
                tokens.set_tag(i, tagger.tag_type, guess)
                tagger.tell_answer(gold)
                if gold != NULL_TAG:
                    total += 1
                    n_corr += guess == gold
        # Guard the division: with no scorable tokens (empty data, or every
        # gold tag NULL_TAG) the original raised ZeroDivisionError.
        accuracy = (n_corr / total) * 100 if total else 0.0
        print('%.4f' % accuracy)
        random.shuffle(train_sents)
    tagger.model.end_training()
    tagger.model.dump(path.join(model_dir, 'model'), freq_thresh=10)
|
||||
|
||||
|
||||
def evaluate(tagger, sents):
    """Return the tagger's accuracy on `sents` as a fraction.

    `sents` is an iterable of (tokens, golds) pairs.  Each prediction is
    written back into `tokens`.  Tokens whose gold tag is NULL_TAG are
    tagged but not scored.

    Returns 0.0 when there is nothing to score (no sentences, or every gold
    tag is NULL_TAG); the original raised ZeroDivisionError in that case.
    """
    n_corr = 0
    total = 0
    for tokens, golds in sents:
        for i, gold in enumerate(golds):
            guess = tagger.predict(i, tokens)
            tokens.set_tag(i, tagger.tag_type, guess)
            if gold != NULL_TAG:
                total += 1
                n_corr += guess == gold
    if not total:
        return 0.0
    return n_corr / total
|
||||
|
||||
|
||||
cdef class Tagger:
|
||||
"""Assign part-of-speech, named entity or supersense tags, using greedy
|
||||
decoding. The tagger reads its model and configuration from disk.
|
||||
"""
|
||||
def __init__(self, model_dir):
|
||||
self.mem = Pool()
|
||||
cfg = json.load(path.join(model_dir, 'config.json'))
|
||||
cfg = json.load(open(path.join(model_dir, 'config.json')))
|
||||
templates = cfg['templates']
|
||||
self.tag_names = cfg['tag_names']
|
||||
self.tag_type = cfg['tag_type']
|
||||
self.model = LinearModel(len(self.tag_names))
|
||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
||||
self.model = LinearModel(len(self.tag_names), self.extractor.n)
|
||||
print("Load tagger model")
|
||||
if path.exists(path.join(model_dir, 'model')):
|
||||
self.model.load(path.join(model_dir, 'model'))
|
||||
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
|
||||
print("Done")
|
||||
|
||||
if self.tag_type == POS:
|
||||
n_context = POS_CONTEXT_SIZE
|
||||
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
|
||||
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
|
||||
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
|
||||
self._scores = <weight_t*>self.mem.alloc(len(self.cfg.tags), sizeof(weight_t))
|
||||
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
|
||||
self._guess = NULL_TAG
|
||||
|
||||
cpdef int set_tags(self, Tokens tokens) except -1:
|
||||
|
@ -54,8 +110,10 @@ cdef class Tagger:
|
|||
>>> tag = EN.pos_tagger.predict(0, tokens)
|
||||
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
|
||||
"""
|
||||
#if self.tag_type == POS:
|
||||
# _pos_feats.fill_context(self._context, i, tokens)
|
||||
if self.tag_type == POS:
|
||||
pos_fill_context(self._context, i, tokens)
|
||||
else:
|
||||
raise StandardError
|
||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
||||
self._guess = self.model.score(self._scores, self._feats, self._values)
|
||||
return self._guess
|
||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from .lexeme cimport Lexeme
|
||||
from .typedefs cimport flag_t
|
||||
from .utf8string cimport StringStore
|
||||
from .tagger cimport TagType
|
||||
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
|
@ -23,6 +24,7 @@ cdef class Tokens:
|
|||
|
||||
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
|
||||
cdef int push_back(self, int i, Lexeme* lexeme) except -1
|
||||
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
|
||||
|
||||
|
||||
cdef class Token:
|
||||
|
|
|
@ -4,6 +4,7 @@ cimport cython
|
|||
|
||||
DEF PADDING = 5
|
||||
|
||||
|
||||
cdef int bounds_check(int i, int length, int padding) except -1:
|
||||
if (i + padding) < 0:
|
||||
raise IndexError
|
||||
|
@ -89,6 +90,9 @@ cdef class Tokens:
|
|||
idx = self.push_back(idx, lexemes[i])
|
||||
return idx
|
||||
|
||||
# Store `tag` as the tag of the token at index `i`.
# NOTE(review): `tag_type` is accepted but not used here -- every tag is
# written to `self.pos` whatever its type, and `i` is not range-checked.
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
|
||||
self.pos[i] = tag
|
||||
|
||||
def _realloc(self, new_size):
|
||||
self.max_length = new_size
|
||||
n = new_size + (PADDING * 2)
|
||||
|
@ -130,4 +134,3 @@ cdef class Token:
|
|||
return ''
|
||||
cdef bytes utf8string = self._string_store[self.sic]
|
||||
return utf8string.decode('utf8')
|
||||
|
||||
|
|
|
@ -6,5 +6,3 @@ ctypedef uint64_t flag_t
|
|||
ctypedef uint32_t id_t
|
||||
ctypedef uint16_t len_t
|
||||
ctypedef uint16_t tag_t
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user