mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-31 19:23:05 +03:00
* Add count_tags function to pos.pyx, which should probably live in another file. Feature set achieves 97.9 on wsj19-21, 95.85 on onto web.
This commit is contained in:
parent
63114820cf
commit
f67cb9a5a3
|
@ -6,15 +6,17 @@ import ujson
|
||||||
import random
|
import random
|
||||||
import codecs
|
import codecs
|
||||||
import gzip
|
import gzip
|
||||||
|
import cython
|
||||||
|
|
||||||
|
from libc.stdint cimport uint32_t
|
||||||
|
|
||||||
|
|
||||||
from thinc.weights cimport arg_max
|
from thinc.weights cimport arg_max
|
||||||
from thinc.features import NonZeroConjFeat
|
from thinc.features import NonZeroConjFeat
|
||||||
from thinc.features import ConjFeat
|
from thinc.features import ConjFeat
|
||||||
|
|
||||||
from .en import EN
|
|
||||||
|
|
||||||
from .lexeme cimport *
|
from .lexeme cimport *
|
||||||
|
from .lang cimport Lexicon
|
||||||
|
|
||||||
|
|
||||||
NULL_TAG = 0
|
NULL_TAG = 0
|
||||||
|
@ -39,11 +41,9 @@ cdef class Tagger:
|
||||||
self._guess = NULL_TAG
|
self._guess = NULL_TAG
|
||||||
|
|
||||||
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
|
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
|
||||||
assert i >= 0
|
|
||||||
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
|
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
|
||||||
tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
|
tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
|
||||||
self.extractor.extract(self._feats, self._values, self._atoms, NULL)
|
self.extractor.extract(self._feats, self._values, self._atoms, NULL)
|
||||||
assert self._feats[self.extractor.n] == 0
|
|
||||||
self._guess = self.model.score(self._scores, self._feats, self._values)
|
self._guess = self.model.score(self._scores, self._feats, self._values)
|
||||||
return self._guess
|
return self._guess
|
||||||
|
|
||||||
|
@ -64,6 +64,21 @@ cdef class Tagger:
|
||||||
return cls.tags[tag]
|
return cls.tags[tag]
|
||||||
|
|
||||||
|
|
||||||
|
@cython.boundscheck(False)
|
||||||
|
def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
|
||||||
|
cdef class_t prev_prev, prev, tag
|
||||||
|
prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
|
||||||
|
cdef int i
|
||||||
|
cdef id_t token
|
||||||
|
for i in range(tokens.length):
|
||||||
|
tag = tagger.predict(i, tokens, prev, prev_prev)
|
||||||
|
prev_prev = prev
|
||||||
|
prev = tag
|
||||||
|
token = tokens.lex[i].id
|
||||||
|
if token < tag_counts.shape[0]:
|
||||||
|
tag_counts[token, tag] += 1
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
P2i
|
P2i
|
||||||
P2c
|
P2c
|
||||||
|
@ -73,6 +88,7 @@ cpdef enum:
|
||||||
P2suff
|
P2suff
|
||||||
P2oft_title
|
P2oft_title
|
||||||
P2oft_upper
|
P2oft_upper
|
||||||
|
P2pos
|
||||||
|
|
||||||
P1i
|
P1i
|
||||||
P1c
|
P1c
|
||||||
|
@ -82,6 +98,7 @@ cpdef enum:
|
||||||
P1suff
|
P1suff
|
||||||
P1oft_title
|
P1oft_title
|
||||||
P1oft_upper
|
P1oft_upper
|
||||||
|
P1pos
|
||||||
|
|
||||||
N0i
|
N0i
|
||||||
N0c
|
N0c
|
||||||
|
@ -91,6 +108,7 @@ cpdef enum:
|
||||||
N0suff
|
N0suff
|
||||||
N0oft_title
|
N0oft_title
|
||||||
N0oft_upper
|
N0oft_upper
|
||||||
|
N0pos
|
||||||
|
|
||||||
N1i
|
N1i
|
||||||
N1c
|
N1c
|
||||||
|
@ -100,6 +118,7 @@ cpdef enum:
|
||||||
N1suff
|
N1suff
|
||||||
N1oft_title
|
N1oft_title
|
||||||
N1oft_upper
|
N1oft_upper
|
||||||
|
N1pos
|
||||||
|
|
||||||
N2i
|
N2i
|
||||||
N2c
|
N2c
|
||||||
|
@ -109,6 +128,7 @@ cpdef enum:
|
||||||
N2suff
|
N2suff
|
||||||
N2oft_title
|
N2oft_title
|
||||||
N2oft_upper
|
N2oft_upper
|
||||||
|
N2pos
|
||||||
|
|
||||||
P2t
|
P2t
|
||||||
P1t
|
P1t
|
||||||
|
@ -137,6 +157,7 @@ cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
|
||||||
|
|
||||||
atoms[6] = lex.flags & (1 << OFT_TITLE)
|
atoms[6] = lex.flags & (1 << OFT_TITLE)
|
||||||
atoms[7] = lex.flags & (1 << OFT_UPPER)
|
atoms[7] = lex.flags & (1 << OFT_UPPER)
|
||||||
|
atoms[8] = lex.postype
|
||||||
|
|
||||||
|
|
||||||
TEMPLATES = (
|
TEMPLATES = (
|
||||||
|
@ -163,4 +184,15 @@ TEMPLATES = (
|
||||||
(P2c,),
|
(P2c,),
|
||||||
(N0oft_upper,),
|
(N0oft_upper,),
|
||||||
(N0oft_title,),
|
(N0oft_title,),
|
||||||
|
|
||||||
|
(P1t, N1w),
|
||||||
|
(P1t, P2t, N1w),
|
||||||
|
(P1w, P2w, N1w),
|
||||||
|
(P2w, N1w, N2w),
|
||||||
|
|
||||||
|
(N0pos,),
|
||||||
|
(N0w, N1pos),
|
||||||
|
(N0w, N1pos, N2pos),
|
||||||
|
(P1t, N0pos),
|
||||||
|
(P2t, P1t, N0pos)
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user