mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-02 02:43:36 +03:00
* Add is_oov property, and fix up handling of attributes
This commit is contained in:
parent
fc268f03eb
commit
8e4c69ee8c
|
@ -12,19 +12,8 @@ cpdef enum attr_id_t:
|
||||||
LIKE_NUM
|
LIKE_NUM
|
||||||
LIKE_EMAIL
|
LIKE_EMAIL
|
||||||
IS_STOP
|
IS_STOP
|
||||||
FLAG0
|
IS_OOV
|
||||||
FLAG1
|
|
||||||
FLAG2
|
|
||||||
FLAG3
|
|
||||||
FLAG4
|
|
||||||
FLAG5
|
|
||||||
FLAG6
|
|
||||||
FLAG7
|
|
||||||
FLAG8
|
|
||||||
FLAG9
|
|
||||||
FLAG10
|
|
||||||
FLAG11
|
|
||||||
FLAG12
|
|
||||||
FLAG13
|
FLAG13
|
||||||
FLAG14
|
FLAG14
|
||||||
FLAG15
|
FLAG15
|
||||||
|
|
|
@ -25,9 +25,9 @@ from ..util import read_lang_data
|
||||||
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
|
|
||||||
def get_lex_props(string, oov_prob=-30):
|
def get_lex_props(string, oov_prob=-30, is_oov=False):
|
||||||
return {
|
return {
|
||||||
'flags': get_flags(string),
|
'flags': get_flags(string, is_oov=is_oov),
|
||||||
'length': len(string),
|
'length': len(string),
|
||||||
'orth': string,
|
'orth': string,
|
||||||
'lower': string.lower(),
|
'lower': string.lower(),
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14
|
from ..attrs cimport FLAG13, FLAG14
|
||||||
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
|
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
|
||||||
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
|
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
|
||||||
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
|
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
|
||||||
|
@ -10,6 +10,7 @@ from ..attrs cimport IS_PUNCT as _IS_PUNCT
|
||||||
from ..attrs cimport IS_SPACE as _IS_SPACE
|
from ..attrs cimport IS_SPACE as _IS_SPACE
|
||||||
from ..attrs cimport IS_TITLE as _IS_TITLE
|
from ..attrs cimport IS_TITLE as _IS_TITLE
|
||||||
from ..attrs cimport IS_UPPER as _IS_UPPER
|
from ..attrs cimport IS_UPPER as _IS_UPPER
|
||||||
|
from ..attrs cimport IS_OOV as _IS_OOV
|
||||||
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
|
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
|
||||||
from ..attrs cimport LIKE_URL as _LIKE_URL
|
from ..attrs cimport LIKE_URL as _LIKE_URL
|
||||||
from ..attrs cimport LIKE_NUM as _LIKE_NUM
|
from ..attrs cimport LIKE_NUM as _LIKE_NUM
|
||||||
|
@ -43,6 +44,7 @@ cpdef enum:
|
||||||
LIKE_NUM = _LIKE_NUM
|
LIKE_NUM = _LIKE_NUM
|
||||||
LIKE_EMAIL = _LIKE_EMAIL
|
LIKE_EMAIL = _LIKE_EMAIL
|
||||||
IS_STOP = _IS_STOP
|
IS_STOP = _IS_STOP
|
||||||
|
IS_OOV = _IS_OOV
|
||||||
|
|
||||||
ORTH = _ORTH
|
ORTH = _ORTH
|
||||||
SHAPE = _SHAPE
|
SHAPE = _SHAPE
|
||||||
|
|
|
@ -11,6 +11,7 @@ import numpy
|
||||||
|
|
||||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
|
from .attrs cimport IS_OOV
|
||||||
|
|
||||||
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
|
@ -51,6 +52,9 @@ cdef class Lexeme:
|
||||||
cdef flags_t one = 1
|
cdef flags_t one = 1
|
||||||
return self.flags & (one << flag_id)
|
return self.flags & (one << flag_id)
|
||||||
|
|
||||||
|
property is_oov:
|
||||||
|
def __get__(self): return self.check_flag(IS_OOV)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return self.check_flag(IS_ALPHA)
|
def __get__(self): return self.check_flag(IS_ALPHA)
|
||||||
|
|
||||||
|
|
|
@ -18,8 +18,7 @@ from ..parts_of_speech cimport CONJ, PUNCT
|
||||||
|
|
||||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
|
from ..attrs cimport IS_OOV
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
|
@ -286,7 +285,9 @@ cdef class Token:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
|
property is_oov:
|
||||||
|
def __get__(self): return check_flag(self.c.lex, IS_OOV)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
|
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
|
||||||
|
|
||||||
|
|
|
@ -95,7 +95,7 @@ cdef class Vocab:
|
||||||
if len(string) < 3:
|
if len(string) < 3:
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
props = self.lexeme_props_getter(string, self.oov_prob)
|
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||||
if is_oov:
|
if is_oov:
|
||||||
lex.id = 0
|
lex.id = 0
|
||||||
|
@ -119,7 +119,7 @@ cdef class Vocab:
|
||||||
if len(string) < 3:
|
if len(string) < 3:
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
props = self.lexeme_props_getter(string)
|
props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
|
||||||
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
|
||||||
if is_oov:
|
if is_oov:
|
||||||
lex.id = 0
|
lex.id = 0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user