mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Add is_oov property, and fix up handling of attributes
This commit is contained in:
parent
fc268f03eb
commit
8e4c69ee8c
|
@ -12,19 +12,8 @@ cpdef enum attr_id_t:
|
|||
LIKE_NUM
|
||||
LIKE_EMAIL
|
||||
IS_STOP
|
||||
FLAG0
|
||||
FLAG1
|
||||
FLAG2
|
||||
FLAG3
|
||||
FLAG4
|
||||
FLAG5
|
||||
FLAG6
|
||||
FLAG7
|
||||
FLAG8
|
||||
FLAG9
|
||||
FLAG10
|
||||
FLAG11
|
||||
FLAG12
|
||||
IS_OOV
|
||||
|
||||
FLAG13
|
||||
FLAG14
|
||||
FLAG15
|
||||
|
|
|
@ -25,9 +25,9 @@ from ..util import read_lang_data
|
|||
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
|
||||
|
||||
def get_lex_props(string, oov_prob=-30):
|
||||
def get_lex_props(string, oov_prob=-30, is_oov=False):
|
||||
return {
|
||||
'flags': get_flags(string),
|
||||
'flags': get_flags(string, is_oov=is_oov),
|
||||
'length': len(string),
|
||||
'orth': string,
|
||||
'lower': string.lower(),
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14
|
||||
from ..attrs cimport FLAG13, FLAG14
|
||||
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
|
||||
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
|
||||
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
|
||||
|
@ -10,6 +10,7 @@ from ..attrs cimport IS_PUNCT as _IS_PUNCT
|
|||
from ..attrs cimport IS_SPACE as _IS_SPACE
|
||||
from ..attrs cimport IS_TITLE as _IS_TITLE
|
||||
from ..attrs cimport IS_UPPER as _IS_UPPER
|
||||
from ..attrs cimport IS_OOV as _IS_OOV
|
||||
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
|
||||
from ..attrs cimport LIKE_URL as _LIKE_URL
|
||||
from ..attrs cimport LIKE_NUM as _LIKE_NUM
|
||||
|
@ -43,6 +44,7 @@ cpdef enum:
|
|||
LIKE_NUM = _LIKE_NUM
|
||||
LIKE_EMAIL = _LIKE_EMAIL
|
||||
IS_STOP = _IS_STOP
|
||||
IS_OOV = _IS_OOV
|
||||
|
||||
ORTH = _ORTH
|
||||
SHAPE = _SHAPE
|
||||
|
|
|
@ -11,6 +11,7 @@ import numpy
|
|||
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from .attrs cimport IS_OOV
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
@ -51,6 +52,9 @@ cdef class Lexeme:
|
|||
cdef flags_t one = 1
|
||||
return self.flags & (one << flag_id)
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return self.check_flag(IS_OOV)
|
||||
|
||||
property is_alpha:
|
||||
def __get__(self): return self.check_flag(IS_ALPHA)
|
||||
|
||||
|
|
|
@ -18,8 +18,7 @@ from ..parts_of_speech cimport CONJ, PUNCT
|
|||
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
|
||||
|
||||
from ..attrs cimport IS_OOV
|
||||
|
||||
|
||||
cdef class Token:
|
||||
|
@ -286,7 +285,9 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
|
||||
|
||||
property is_oov:
|
||||
def __get__(self): return check_flag(self.c.lex, IS_OOV)
|
||||
|
||||
property is_alpha:
|
||||
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
|
||||
|
||||
|
|
|
@ -95,7 +95,7 @@ cdef class Vocab:
|
|||
if len(string) < 3:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
props = self.lexeme_props_getter(string, self.oov_prob)
|
||||
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
if is_oov:
|
||||
lex.id = 0
|
||||
|
@ -119,7 +119,7 @@ cdef class Vocab:
|
|||
if len(string) < 3:
|
||||
mem = self.mem
|
||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||
props = self.lexeme_props_getter(string)
|
||||
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||
if is_oov:
|
||||
lex.id = 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user