* Add is_oov property, and fix up handling of attributes

This commit is contained in:
Matthew Honnibal 2015-07-27 01:50:06 +02:00
parent fc268f03eb
commit 8e4c69ee8c
6 changed files with 17 additions and 21 deletions

View File

@@ -12,19 +12,8 @@ cpdef enum attr_id_t:
LIKE_NUM
LIKE_EMAIL
IS_STOP
FLAG0
FLAG1
FLAG2
FLAG3
FLAG4
FLAG5
FLAG6
FLAG7
FLAG8
FLAG9
FLAG10
FLAG11
FLAG12
IS_OOV
FLAG13
FLAG14
FLAG15

View File

@@ -25,9 +25,9 @@ from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string, oov_prob=-30):
def get_lex_props(string, oov_prob=-30, is_oov=False):
return {
'flags': get_flags(string),
'flags': get_flags(string, is_oov=is_oov),
'length': len(string),
'orth': string,
'lower': string.lower(),

View File

@@ -1,4 +1,4 @@
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14
from ..attrs cimport FLAG13, FLAG14
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
@@ -10,6 +10,7 @@ from ..attrs cimport IS_PUNCT as _IS_PUNCT
from ..attrs cimport IS_SPACE as _IS_SPACE
from ..attrs cimport IS_TITLE as _IS_TITLE
from ..attrs cimport IS_UPPER as _IS_UPPER
from ..attrs cimport IS_OOV as _IS_OOV
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
from ..attrs cimport LIKE_URL as _LIKE_URL
from ..attrs cimport LIKE_NUM as _LIKE_NUM
@@ -43,6 +44,7 @@ cpdef enum:
LIKE_NUM = _LIKE_NUM
LIKE_EMAIL = _LIKE_EMAIL
IS_STOP = _IS_STOP
IS_OOV = _IS_OOV
ORTH = _ORTH
SHAPE = _SHAPE

View File

@@ -11,6 +11,7 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -51,6 +52,9 @@ cdef class Lexeme:
cdef flags_t one = 1
return self.flags & (one << flag_id)
property is_oov:
def __get__(self): return self.check_flag(IS_OOV)
property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA)

View File

@@ -18,8 +18,7 @@ from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
cdef class Token:
@@ -286,7 +285,9 @@ cdef class Token:
def __get__(self):
return self.vocab.strings[self.c.dep]
property is_oov:
def __get__(self): return check_flag(self.c.lex, IS_OOV)
property is_alpha:
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)

View File

@@ -95,7 +95,7 @@ cdef class Vocab:
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
@@ -119,7 +119,7 @@ cdef class Vocab:
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string)
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0