* Add is_oov property, and fix up handling of attributes

This commit is contained in:
Matthew Honnibal 2015-07-27 01:50:06 +02:00
parent fc268f03eb
commit 8e4c69ee8c
6 changed files with 17 additions and 21 deletions

View File

@@ -12,19 +12,8 @@ cpdef enum attr_id_t:
LIKE_NUM LIKE_NUM
LIKE_EMAIL LIKE_EMAIL
IS_STOP IS_STOP
FLAG0 IS_OOV
FLAG1
FLAG2
FLAG3
FLAG4
FLAG5
FLAG6
FLAG7
FLAG8
FLAG9
FLAG10
FLAG11
FLAG12
FLAG13 FLAG13
FLAG14 FLAG14
FLAG15 FLAG15

View File

@@ -25,9 +25,9 @@ from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string, oov_prob=-30): def get_lex_props(string, oov_prob=-30, is_oov=False):
return { return {
'flags': get_flags(string), 'flags': get_flags(string, is_oov=is_oov),
'length': len(string), 'length': len(string),
'orth': string, 'orth': string,
'lower': string.lower(), 'lower': string.lower(),

View File

@@ -1,4 +1,4 @@
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14 from ..attrs cimport FLAG13, FLAG14
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21 from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28 from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32 from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
@@ -10,6 +10,7 @@ from ..attrs cimport IS_PUNCT as _IS_PUNCT
from ..attrs cimport IS_SPACE as _IS_SPACE from ..attrs cimport IS_SPACE as _IS_SPACE
from ..attrs cimport IS_TITLE as _IS_TITLE from ..attrs cimport IS_TITLE as _IS_TITLE
from ..attrs cimport IS_UPPER as _IS_UPPER from ..attrs cimport IS_UPPER as _IS_UPPER
from ..attrs cimport IS_OOV as _IS_OOV
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
from ..attrs cimport LIKE_URL as _LIKE_URL from ..attrs cimport LIKE_URL as _LIKE_URL
from ..attrs cimport LIKE_NUM as _LIKE_NUM from ..attrs cimport LIKE_NUM as _LIKE_NUM
@@ -43,6 +44,7 @@ cpdef enum:
LIKE_NUM = _LIKE_NUM LIKE_NUM = _LIKE_NUM
LIKE_EMAIL = _LIKE_EMAIL LIKE_EMAIL = _LIKE_EMAIL
IS_STOP = _IS_STOP IS_STOP = _IS_STOP
IS_OOV = _IS_OOV
ORTH = _ORTH ORTH = _ORTH
SHAPE = _SHAPE SHAPE = _SHAPE

View File

@@ -11,6 +11,7 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -51,6 +52,9 @@ cdef class Lexeme:
cdef flags_t one = 1 cdef flags_t one = 1
return self.flags & (one << flag_id) return self.flags & (one << flag_id)
property is_oov:
def __get__(self): return self.check_flag(IS_OOV)
property is_alpha: property is_alpha:
def __get__(self): return self.check_flag(IS_ALPHA) def __get__(self): return self.check_flag(IS_ALPHA)

View File

@@ -18,8 +18,7 @@ from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
cdef class Token: cdef class Token:
@@ -286,7 +285,9 @@ cdef class Token:
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]
property is_oov:
def __get__(self): return check_flag(self.c.lex, IS_OOV)
property is_alpha: property is_alpha:
def __get__(self): return check_flag(self.c.lex, IS_ALPHA) def __get__(self): return check_flag(self.c.lex, IS_ALPHA)

View File

@@ -95,7 +95,7 @@ cdef class Vocab:
if len(string) < 3: if len(string) < 3:
mem = self.mem mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string, self.oov_prob) props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov: if is_oov:
lex.id = 0 lex.id = 0
@@ -119,7 +119,7 @@ cdef class Vocab:
if len(string) < 3: if len(string) < 3:
mem = self.mem mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string) props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov: if is_oov:
lex.id = 0 lex.id = 0