mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Work on language-independent refactoring
This commit is contained in:
parent
3879d28457
commit
6f1743692a
|
@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False):
|
||||||
'sentiment': 0
|
'sentiment': 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
get_lex_attr = {}
|
||||||
|
|
||||||
if_model_present = -1
|
if_model_present = -1
|
||||||
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE
|
||||||
|
|
||||||
from .structs cimport LexemeC
|
from .structs cimport LexemeC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
from .vocab cimport Vocab
|
||||||
|
|
||||||
from numpy cimport ndarray
|
from numpy cimport ndarray
|
||||||
|
|
||||||
|
@ -15,7 +16,8 @@ cdef class Lexeme:
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
cdef readonly attr_t orth
|
cdef readonly attr_t orth
|
||||||
|
|
||||||
cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
|
@staticmethod
|
||||||
|
cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
|
||||||
lex.length = props['length']
|
lex.length = props['length']
|
||||||
lex.orth = vocab.strings[props['orth']]
|
lex.orth = vocab.strings[props['orth']]
|
||||||
lex.lower = vocab.strings[props['lower']]
|
lex.lower = vocab.strings[props['lower']]
|
||||||
|
@ -29,7 +31,6 @@ cdef class Lexeme:
|
||||||
lex.sentiment = props['sentiment']
|
lex.sentiment = props['sentiment']
|
||||||
|
|
||||||
lex.flags = props['flags']
|
lex.flags = props['flags']
|
||||||
lex.repvec = empty_vec
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
|
@ -55,6 +56,34 @@ cdef class Lexeme:
|
||||||
return lex.cluster
|
return lex.cluster
|
||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
|
||||||
|
if name < (sizeof(flags_t) * 8):
|
||||||
|
Lexeme.set_flag(lex, name, value)
|
||||||
|
elif name == ID:
|
||||||
|
lex.id = value
|
||||||
|
elif name == LOWER:
|
||||||
|
lex.lower = value
|
||||||
|
elif name == NORM:
|
||||||
|
lex.norm = value
|
||||||
|
elif name == SHAPE:
|
||||||
|
lex.shape = value
|
||||||
|
elif name == PREFIX:
|
||||||
|
lex.prefix = value
|
||||||
|
elif name == SUFFIX:
|
||||||
|
lex.suffix = value
|
||||||
|
elif name == CLUSTER:
|
||||||
|
lex.cluster = value
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.flags & (1 << flag_id)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil:
|
||||||
|
cdef flags_t one = 1
|
||||||
|
if value:
|
||||||
|
lex.flags |= one << flag_id
|
||||||
|
else:
|
||||||
|
lex.flags &= ~(one << flag_id)
|
||||||
|
|
|
@ -26,12 +26,9 @@ cdef class Lexeme:
|
||||||
def __init__(self, Vocab vocab, int orth):
|
def __init__(self, Vocab vocab, int orth):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.orth = orth
|
self.orth = orth
|
||||||
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
|
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
|
||||||
|
assert self.c.orth == orth
|
||||||
|
|
||||||
property orth:
|
|
||||||
def __get__(self):
|
|
||||||
return self.c.orth
|
|
||||||
|
|
||||||
property lower:
|
property lower:
|
||||||
def __get__(self): return self.c.lower
|
def __get__(self): return self.c.lower
|
||||||
def __set__(self, int x): self.c.lower = x
|
def __set__(self, int x): self.c.lower = x
|
||||||
|
@ -113,7 +110,7 @@ cdef class Lexeme:
|
||||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
|
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
||||||
property like_num:
|
property like_num:
|
||||||
def __get__(self): return Lexeme.like_num(self.c, IKE_NUM)
|
def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
|
||||||
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
|
def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)
|
||||||
|
|
||||||
property like_email:
|
property like_email:
|
||||||
|
|
|
@ -103,20 +103,21 @@ cdef class Matcher:
|
||||||
|
|
||||||
def __init__(self, vocab, patterns):
|
def __init__(self, vocab, patterns):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
self.vocab = vocab
|
||||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||||
self.add(entity_key, etype, attrs, specs)
|
self.add(entity_key, etype, attrs, specs)
|
||||||
|
|
||||||
def add(self, entity_key, etype, attrs, specs):
|
def add(self, entity_key, etype, attrs, specs):
|
||||||
if isinstance(entity_key, basestring):
|
if isinstance(entity_key, basestring):
|
||||||
entity_key = vocab.strings[entity_key]
|
entity_key = self.vocab.strings[entity_key]
|
||||||
if isinstance(etype, basestring):
|
if isinstance(etype, basestring):
|
||||||
etype = vocab.strings[etype]
|
etype = self.vocab.strings[etype]
|
||||||
elif etype is None:
|
elif etype is None:
|
||||||
etype = -1
|
etype = -1
|
||||||
# TODO: Do something more clever about multiple patterns for single
|
# TODO: Do something more clever about multiple patterns for single
|
||||||
# entity
|
# entity
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
spec = _convert_strings(spec, vocab.strings)
|
spec = _convert_strings(spec, self.vocab.strings)
|
||||||
self.patterns.push_back(init_pattern(self.mem, spec, etype))
|
self.patterns.push_back(init_pattern(self.mem, spec, etype))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
|
@ -92,6 +92,7 @@ cpdef bint like_url(unicode string):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: This should live in the language.orth
|
||||||
NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
||||||
'eleven twelve thirteen fourteen fifteen sixteen seventeen'
|
'eleven twelve thirteen fourteen fifteen sixteen seventeen'
|
||||||
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
||||||
|
|
|
@ -142,6 +142,8 @@ cdef class StringStore:
|
||||||
def load(self, loc):
|
def load(self, loc):
|
||||||
with codecs.open(loc, 'r', 'utf8') as file_:
|
with codecs.open(loc, 'r', 'utf8') as file_:
|
||||||
strings = file_.read().split(SEPARATOR)
|
strings = file_.read().split(SEPARATOR)
|
||||||
|
if strings == ['']:
|
||||||
|
return None
|
||||||
cdef unicode string
|
cdef unicode string
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
for string in strings:
|
for string in strings:
|
||||||
|
|
|
@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
|
||||||
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
from ..parts_of_speech import UNIV_POS_NAMES
|
from ..parts_of_speech import UNIV_POS_NAMES
|
||||||
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
|
from ..parts_of_speech cimport CONJ, PUNCT, NOUN
|
||||||
from ..lexeme cimport check_flag
|
from ..lexeme cimport Lexeme
|
||||||
from ..lexeme cimport get_attr as get_lex_attr
|
|
||||||
from .spans cimport Span
|
from .spans cimport Span
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
from ..serialize.bits cimport BitArray
|
from ..serialize.bits cimport BitArray
|
||||||
|
@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||||
elif feat_name == ENT_TYPE:
|
elif feat_name == ENT_TYPE:
|
||||||
return token.ent_type
|
return token.ent_type
|
||||||
else:
|
else:
|
||||||
return get_lex_attr(token.lex, feat_name)
|
return Lexeme.get_struct_attr(token.lex, feat_name)
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
|
@ -218,6 +217,7 @@ cdef class Doc:
|
||||||
t.idx = 0
|
t.idx = 0
|
||||||
else:
|
else:
|
||||||
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
||||||
|
assert t.lex.orth != 0
|
||||||
t.spacy = has_space
|
t.spacy = has_space
|
||||||
self.length += 1
|
self.length += 1
|
||||||
self._py_tokens.append(None)
|
self._py_tokens.append(None)
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||||
from ..lexeme cimport check_flag
|
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
from cython.view cimport array as cvarray
|
from cython.view cimport array as cvarray
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||||
from ..attrs cimport IS_OOV
|
from ..attrs cimport IS_OOV
|
||||||
|
|
||||||
|
from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
|
@ -42,7 +43,7 @@ cdef class Token:
|
||||||
return self.string
|
return self.string
|
||||||
|
|
||||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||||
return check_flag(self.c.lex, flag_id)
|
return Lexeme.check_flag(self.c.lex, flag_id)
|
||||||
|
|
||||||
def nbor(self, int i=1):
|
def nbor(self, int i=1):
|
||||||
return self.doc[self.i+i]
|
return self.doc[self.i+i]
|
||||||
|
@ -286,37 +287,37 @@ cdef class Token:
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
property is_oov:
|
property is_oov:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_OOV)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV)
|
||||||
|
|
||||||
property is_alpha:
|
property is_alpha:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_ALPHA)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA)
|
||||||
|
|
||||||
property is_ascii:
|
property is_ascii:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_ASCII)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII)
|
||||||
|
|
||||||
property is_digit:
|
property is_digit:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_DIGIT)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT)
|
||||||
|
|
||||||
property is_lower:
|
property is_lower:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_LOWER)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER)
|
||||||
|
|
||||||
property is_title:
|
property is_title:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_TITLE)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE)
|
||||||
|
|
||||||
property is_punct:
|
property is_punct:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_PUNCT)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT)
|
||||||
|
|
||||||
property is_space:
|
property is_space:
|
||||||
def __get__(self): return check_flag(self.c.lex, IS_SPACE)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE)
|
||||||
|
|
||||||
property like_url:
|
property like_url:
|
||||||
def __get__(self): return check_flag(self.c.lex, LIKE_URL)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL)
|
||||||
|
|
||||||
property like_num:
|
property like_num:
|
||||||
def __get__(self): return check_flag(self.c.lex, LIKE_NUM)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM)
|
||||||
|
|
||||||
property like_email:
|
property like_email:
|
||||||
def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL)
|
def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL)
|
||||||
|
|
||||||
|
|
||||||
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||||
|
|
|
@ -27,15 +27,16 @@ cdef class Vocab:
|
||||||
cpdef public lexeme_props_getter
|
cpdef public lexeme_props_getter
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cpdef readonly StringStore strings
|
cpdef readonly StringStore strings
|
||||||
cdef readonly object pos_tags
|
|
||||||
cdef readonly int length
|
cdef readonly int length
|
||||||
cdef public object _serializer
|
cdef public object _serializer
|
||||||
cdef public object data_dir
|
cdef public object data_dir
|
||||||
cdef public float oov_prob
|
cdef public object get_lex_attr
|
||||||
|
cdef public object pos_tags
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
|
||||||
|
|
||||||
|
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||||
|
|
||||||
cdef PreshMap _by_hash
|
cdef PreshMap _by_hash
|
||||||
|
|
|
@ -12,7 +12,6 @@ import math
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport set_lex_struct_props
|
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
|
@ -36,17 +35,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
'''A map container for a language's LexemeC structs.
|
'''A map container for a language's LexemeC structs.
|
||||||
'''
|
'''
|
||||||
def __init__(self, data_dir=None, get_lex_attr=None):
|
def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._by_hash = PreshMap()
|
self._by_hash = PreshMap()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
self.pos_tags = pos_tags if pos_tags is not None else {}
|
|
||||||
|
|
||||||
self.get_lex_attr = get_lex_attr
|
self.get_lex_attr = get_lex_attr
|
||||||
self.repvec_length = 0
|
self.repvec_length = 0
|
||||||
self.length = 0
|
self.length = 1
|
||||||
self._add_lex_to_vocab(0, &EMPTY_LEXEME)
|
self.pos_tags = pos_tags
|
||||||
if data_dir is not None:
|
if data_dir is not None:
|
||||||
if not path.exists(data_dir):
|
if not path.exists(data_dir):
|
||||||
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
|
||||||
|
@ -84,7 +81,10 @@ cdef class Vocab:
|
||||||
cdef LexemeC* lex
|
cdef LexemeC* lex
|
||||||
cdef hash_t key = hash_string(string)
|
cdef hash_t key = hash_string(string)
|
||||||
lex = <LexemeC*>self._by_hash.get(key)
|
lex = <LexemeC*>self._by_hash.get(key)
|
||||||
|
cdef size_t addr
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
|
print string, lex.orth, self.strings[string]
|
||||||
|
assert lex.orth == self.strings[string]
|
||||||
return lex
|
return lex
|
||||||
else:
|
else:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(mem, string)
|
||||||
|
@ -103,15 +103,24 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, self.strings[orth])
|
return self._new_lexeme(mem, self.strings[orth])
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||||
|
cdef hash_t key
|
||||||
cdef bint is_oov = mem is not self.mem
|
cdef bint is_oov = mem is not self.mem
|
||||||
if len(string) < 3:
|
mem = self.mem
|
||||||
mem = self.mem
|
#if len(string) < 3:
|
||||||
|
# mem = self.mem
|
||||||
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
for attr, func in self.lex_attr_getters.items():
|
lex.orth = self.strings[string]
|
||||||
Lexeme.set_struct_attr(lex, attr, func(string))
|
lex.id = self.length
|
||||||
|
if self.get_lex_attr is not None:
|
||||||
|
for attr, func in self.get_lex_attr.items():
|
||||||
|
value = func(string)
|
||||||
|
if isinstance(value, unicode):
|
||||||
|
value = self.strings[value]
|
||||||
|
Lexeme.set_struct_attr(lex, attr, value)
|
||||||
if is_oov:
|
if is_oov:
|
||||||
lex.id = 0
|
lex.id = 0
|
||||||
else:
|
else:
|
||||||
|
key = hash_string(string)
|
||||||
self._add_lex_to_vocab(key, lex)
|
self._add_lex_to_vocab(key, lex)
|
||||||
assert lex != NULL, string
|
assert lex != NULL, string
|
||||||
return lex
|
return lex
|
||||||
|
@ -119,13 +128,14 @@ cdef class Vocab:
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||||
self._by_hash.set(key, <void*>lex)
|
self._by_hash.set(key, <void*>lex)
|
||||||
self._by_orth.set(lex.orth, <void*>lex)
|
self._by_orth.set(lex.orth, <void*>lex)
|
||||||
|
print "Add lex", key, lex.orth, self.strings[lex.orth]
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
for orth, addr in self._by_orth.items():
|
for orth, addr in self._by_orth.items():
|
||||||
yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length)
|
yield Lexeme(self, orth)
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
|
@ -142,22 +152,12 @@ cdef class Vocab:
|
||||||
An instance of the Lexeme Python class, with data copied on
|
An instance of the Lexeme Python class, with data copied on
|
||||||
instantiation.
|
instantiation.
|
||||||
'''
|
'''
|
||||||
cdef const LexemeC* lexeme
|
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if type(id_or_string) == int:
|
if type(id_or_string) == unicode:
|
||||||
orth = id_or_string
|
orth = self.strings[id_or_string]
|
||||||
lexeme = <LexemeC*>self._by_orth.get(orth)
|
|
||||||
if lexeme == NULL:
|
|
||||||
raise KeyError(id_or_string)
|
|
||||||
assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth))
|
|
||||||
elif type(id_or_string) == unicode:
|
|
||||||
lexeme = self.get(self.mem, id_or_string)
|
|
||||||
assert lexeme.orth == self.strings[id_or_string]
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("Vocab unable to map type: "
|
orth = id_or_string
|
||||||
"%s. Maps unicode --> Lexeme or "
|
return Lexeme(self, orth)
|
||||||
"int --> Lexeme" % str(type(id_or_string)))
|
|
||||||
return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)
|
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
if path.exists(loc):
|
if path.exists(loc):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user