Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 01:46:28 +03:00
* Tmp. Working on refactor. Compiles, must hook up lexical feats.
This commit is contained in: parent 46da3d74d2 · commit 0930892fc1
spacy/en/__init__.py

@@ -12,7 +12,10 @@ from .attrs import get_flags
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'dense': 1}
+    return {'flags': get_flags(string), 'length': len(string),
+            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
+            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
+            'sentiment': 0}
 
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
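For orientation, here is a minimal plain-Python sketch (not part of the diff) of the property dict the refactored get_lex_props now produces. The word 'apple' is an arbitrary example, and get_flags is stubbed out, since the real one is imported from .attrs in the diff:

    # Sketch of the props-dict contract; get_flags is a stand-in.
    def get_flags(string):
        return 0  # placeholder for the real orthographic flag computation

    def get_lex_props(string):
        return {'flags': get_flags(string), 'length': len(string),
                'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
                'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
                'sentiment': 0}

    props = get_lex_props(u'apple')
    assert props['prefix'] == u'a' and props['suffix'] == u'ple'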
@@ -45,7 +48,7 @@ class English(object):
     """
     def __init__(self, data_dir=LOCAL_DATA_DIR):
         self._data_dir = data_dir
-        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
+        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props)
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
 
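A short sketch of what the new `if data_dir else None` guard buys (plain Python, hypothetical paths): constructing English with no data directory now hands Vocab a clean None instead of failing inside path.join:

    from os import path

    def vocab_data_dir(data_dir):
        # Mirrors the guard in English.__init__: propagate "no data" as None.
        return path.join(data_dir, 'vocab') if data_dir else None

    assert vocab_data_dir('/models/en') == path.join('/models/en', 'vocab')
    assert vocab_data_dir(None) is None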
spacy/en/pos.pyx

@@ -283,12 +283,12 @@ cdef class EnPosTagger:
     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        cdef bytes py_string = self.strings[lex.sic]
+        cdef unicode py_string = self.strings[lex.sic]
         if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
         cdef set lemma_strings
         cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i
         return lemma
 
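This change leans on the StringStore change further down: once the store hands back unicode instead of bytes, the decode('utf8') at this call site becomes redundant. A toy illustration of the moved encoding boundary, with a dict standing in for the store:

    store = {5: u'walking'}            # stand-in: StringStore id -> unicode

    # Before: callers received bytes and decoded at every use site, e.g.
    #   lemma_strings = lemmatizer(store[5].decode('utf8'), pos)
    # After: the store returns unicode, so the value passes straight through.
    py_string = store[5]
    assert isinstance(py_string, type(u''))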
spacy/lexeme.pxd

@@ -7,9 +7,7 @@ from .strings cimport StringStore
 cdef LexemeC EMPTY_LEXEME
 
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
-                  dict props) except *
-
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1
 
 cdef class Lexeme:
     cdef const float* vec
 
spacy/lexeme.pyx

@@ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64
 from libc.string cimport memset
 
 from .orth cimport word_shape
+from .typedefs cimport attr_t
 
 
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 
 
-cdef LexemeC init(id_t i, unicode string, hash_t hashed,
-                  StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
-
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
+cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1:
+    lex.length = props['length']
+    lex.sic = string_store[props['sic']]
+    lex.norm1 = string_store[props['norm1']]
+    lex.norm2 = string_store[props['norm2']]
+    lex.shape = string_store[props['shape']]
+    lex.prefix = string_store[props['prefix']]
+    lex.suffix = string_store[props['suffix']]
+
+    lex.cluster = props['cluster']
+    lex.prob = props['prob']
+    lex.sentiment = props['sentiment']
+
+    lex.flags = props['flags']
 
 
 cdef class Lexeme:
 
spacy/strings.pyx

@@ -67,7 +67,7 @@ cdef class StringStore:
             if string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[<int>string_or_id]
-            return utf8str.chars[:utf8str.length]
+            return utf8str.chars[:utf8str.length].decode('utf8')
         elif isinstance(string_or_id, bytes):
             utf8str = self.intern(<char*>string_or_id, len(string_or_id))
             return utf8str.i
 
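A sketch of the round-trip the StringStore now guarantees, with a toy dict-based store; the real one interns UTF-8 in C, and this snippet only models the types:

    class ToyStringStore(object):
        """Dict-based stand-in: intern on unicode, decode on int lookup."""
        def __init__(self):
            self._ids = {}
            self._strings = [None]     # slot 0 reserved, as in the diff
        def __getitem__(self, string_or_id):
            if isinstance(string_or_id, int):
                return self._strings[string_or_id]      # unicode out
            if string_or_id not in self._ids:
                self._ids[string_or_id] = len(self._strings)
                self._strings.append(string_or_id)
            return self._ids[string_or_id]              # int id out

    strings = ToyStringStore()
    i = strings[u'apple']
    assert strings[i] == u'apple'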
spacy/tokens.pxd

@@ -42,32 +42,5 @@ cdef class Tokens:
 
 
 cdef class Token:
-    cdef cvarray vec
-    cdef readonly flags_t flags
-
-    cdef readonly attr_t id
-    cdef readonly attr_t sic
-    cdef readonly attr_t dense
-    cdef readonly attr_t shape
-    cdef readonly attr_t prefix
-    cdef readonly attr_t suffix
-
-    cdef readonly attr_t length
-    cdef readonly attr_t cluster
-    cdef readonly attr_t pos_type
-
-    cdef readonly float prob
-    cdef readonly float sentiment
-
-    cdef readonly Morphology morph
-    cdef readonly univ_tag_t pos
-    cdef readonly int fine_pos
-    cdef readonly int idx
-    cdef readonly int lemma
-    cdef readonly int sense
-    cdef readonly int dep_tag
-
-    cdef readonly int head_offset
-    cdef readonly uint32_t l_kids
-    cdef readonly uint32_t r_kids
+    cdef readonly Tokens _seq
+    cdef readonly int i
 
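The slimmed-down header turns Token into a view: instead of copying a dozen fields out of the TokenC struct, the object keeps only a reference to its owning sequence and an index, and reads fields on demand. A hypothetical pure-Python analogue of the pattern:

    class TokenView(object):
        """Hypothetical analogue of the new Token: (sequence, index) only."""
        def __init__(self, seq, i):
            self._seq = seq
            self.i = i
        def __len__(self):
            # Fetched from the owning sequence at access time, never copied.
            return len(self._seq.words[self.i])

    class Seq(object):
        def __init__(self, words):
            self.words = words

    assert len(TokenView(Seq([u'Hello', u'world']), 1)) == 5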
spacy/tokens.pyx
@@ -85,7 +85,7 @@ cdef class Tokens:
             token (Token):
         """
         bounds_check(i, self.length, PADDING)
-        return cinit_token(&self.data[i])
+        return Token(self, i)
 
     def __iter__(self):
         """Iterate over the tokens.
 
@@ -174,38 +174,26 @@ cdef class Tokens:
             self.data[i].lex = &EMPTY_LEXEME
 
 
-cdef Token cinit_token(const TokenC* c_tok):
-    cdef Token py_tok = Token.__new__(Token)
-    py_tok.morph = c_tok.morph
-    py_tok.pos = c_tok.pos
-    py_tok.fine_pos = c_tok.fine_pos
-    py_tok.idx = c_tok.idx
-    py_tok.lemma = c_tok.lemma
-    py_tok.sense = c_tok.sense
-    py_tok.dep_tag = c_tok.dep_tag
-    py_tok.head_offset = c_tok.head
-    py_tok.l_kids = c_tok.l_kids
-    py_tok.r_kids = c_tok.r_kids
-    return py_tok
-
-
+@cython.freelist(64)
 cdef class Token:
     """An individual token.
-    """
-    def __init__(self):
-        pass
-        #self._seq = tokens
-        #self.i = i
-
-    #def __unicode__(self):
-    #    cdef const TokenC* t = &self._seq.data[self.i]
-    #    cdef int end_idx = t.idx + t.lex.length
-    #    if self.i + 1 == self._seq.length:
-    #        return self.string
-    #    if end_idx == t[1].idx:
-    #        return self.string
-    #    else:
-    #        return self.string + ' '
+
+    Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
+    object.
+    """
+    def __init__(self, Tokens tokens, int i):
+        self._seq = tokens
+        self.i = i
+
+    def __unicode__(self):
+        cdef const TokenC* t = &self._seq.data[self.i]
+        cdef int end_idx = t.idx + t.lex.length
+        if self.i + 1 == self._seq.length:
+            return self.string
+        if end_idx == t[1].idx:
+            return self.string
+        else:
+            return self.string + ' '
 
     def __len__(self):
         """The number of unicode code-points in the original string.
 
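The resurrected __unicode__ decides whether to append a trailing space by comparing this token's end offset against the next token's start offset. The same rule in plain Python, over a hypothetical list of (string, idx) pairs:

    def token_text(tokens, i):
        string, idx = tokens[i]
        end_idx = idx + len(string)
        if i + 1 == len(tokens):           # last token: no trailing space
            return string
        if end_idx == tokens[i + 1][1]:    # next token abuts: no space was there
            return string
        return string + ' '

    toks = [(u'Hello', 0), (u',', 5), (u'world', 7)]
    assert u''.join(token_text(toks, i) for i in range(3)) == u'Hello, world'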
@@ -213,87 +201,87 @@ cdef class Token:
         Returns:
             length (int):
         """
-        return self.length
+        return self._seq.data[self.i].lex.length
 
-    #property idx:
-    #    """The index into the original string at which the token starts.
-    #
-    #    The following is supposed to always be true:
-    #
-    #    >>> original_string[token.idx:token.idx + len(token)] == token.string
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].idx
+    property idx:
+        """The index into the original string at which the token starts.
+
+        The following is supposed to always be true:
+
+        >>> original_string[token.idx:token.idx + len(token)] == token.string
+        """
+        def __get__(self):
+            return self._seq.data[self.i].idx
 
-    #property cluster:
-    #    """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-    #
-    #    Similar words have better-than-chance likelihood of having similar cluster
-    #    IDs, although the clustering is quite noisy. Cluster IDs make good features,
-    #    and help to make models slightly more robust to domain variation.
-
-    #    A common trick is to use only the first N bits of a cluster ID in a feature,
-    #    as the more general part of the hierarchical clustering is often more accurate
-    #    than the lower categories.
-
-    #    To assist in this, I encode the cluster IDs little-endian, to allow a simple
-    #    bit-mask:
-
-    #    >>> six_bits = cluster & (2**6 - 1)
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.cluster
+    property cluster:
+        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
+
+        Similar words have better-than-chance likelihood of having similar cluster
+        IDs, although the clustering is quite noisy. Cluster IDs make good features,
+        and help to make models slightly more robust to domain variation.
+
+        A common trick is to use only the first N bits of a cluster ID in a feature,
+        as the more general part of the hierarchical clustering is often more accurate
+        than the lower categories.
+
+        To assist in this, I encode the cluster IDs little-endian, to allow a simple
+        bit-mask:
+
+        >>> six_bits = cluster & (2**6 - 1)
+        """
+        def __get__(self):
+            return self._seq.data[self.i].lex.cluster
 
-    #property string:
-    #    """The unicode string of the word, with no whitespace padding."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lex.sic == 0:
-    #            return ''
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
-    #        return utf8string.decode('utf8')
+    property string:
+        """The unicode string of the word, with no whitespace padding."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lex.sic == 0:
+                return ''
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            return py_ustr
 
-    #property lemma:
-    #    """The unicode string of the word's lemma. If no part-of-speech tag is
-    #    assigned, the most common part-of-speech tag of the word is used.
-    #    """
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        if t.lemma == 0:
-    #            return self.string
-    #        cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
-    #        return utf8string.decode('utf8')
+    property lemma:
+        """The unicode string of the word's lemma. If no part-of-speech tag is
+        assigned, the most common part-of-speech tag of the word is used.
+        """
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            if t.lemma == 0:
+                return self.string
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
+            return py_ustr
 
-    #property dep_tag:
-    #    """The ID integer of the word's dependency label. If no parse has been
-    #    assigned, defaults to 0.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].dep_tag
+    property dep_tag:
+        """The ID integer of the word's dependency label. If no parse has been
+        assigned, defaults to 0.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].dep_tag
 
-    #property pos:
-    #    """The ID integer of the word's part-of-speech tag, from the 13-tag
-    #    Google Universal Tag Set. Constants for this tag set are available in
-    #    spacy.typedefs.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].pos
+    property pos:
+        """The ID integer of the word's part-of-speech tag, from the 13-tag
+        Google Universal Tag Set. Constants for this tag set are available in
+        spacy.typedefs.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].pos
 
-    #property fine_pos:
-    #    """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-    #    by the tagger model. Fine-grained tags include morphological information,
-    #    and other distinctions, and allow a more accurate tagger to be trained.
-    #    """
-    #    def __get__(self):
-    #        return self._seq.data[self.i].fine_pos
+    property fine_pos:
+        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
+        by the tagger model. Fine-grained tags include morphological information,
+        and other distinctions, and allow a more accurate tagger to be trained.
+        """
+        def __get__(self):
+            return self._seq.data[self.i].fine_pos
 
-    #property sic:
-    #    def __get__(self):
-    #        return self._seq.data[self.i].lex.sic
+    property sic:
+        def __get__(self):
+            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
 
-    #property head:
-    #    """The token predicted by the parser to be the head of the current token."""
-    #    def __get__(self):
-    #        cdef const TokenC* t = &self._seq.data[self.i]
-    #        return Token(self._seq, self.i + t.head)
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
+        def __get__(self):
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)
 
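The cluster property's docstring describes masking the low bits of the little-endian cluster ID; spelled out as a runnable snippet (the ID value is made up):

    cluster = 0b101101                 # hypothetical Brown cluster ID
    six_bits = cluster & (2**6 - 1)    # the docstring's own example mask
    four_bits = cluster & (2**4 - 1)   # coarser: fewer hierarchy levels kept
    assert six_bits == 0b101101 and four_bits == 0b1101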
spacy/vocab.pxd

@@ -24,12 +24,13 @@ cdef struct _Cached:
 
 
 cdef class Vocab:
-    cpdef public get_lex_props
+    cpdef public lexeme_props_getter
     cdef Pool mem
     cpdef readonly StringStore strings
-    cdef vector[LexemeC*] lexemes
+    cdef vector[const LexemeC*] lexemes
 
     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
 
     cdef PreshMap _map
 
spacy/vocab.pyx

@@ -5,7 +5,7 @@ from os import path
 import codecs
 
 from .lexeme cimport EMPTY_LEXEME
-from .lexeme cimport init as lexeme_init
+from .lexeme cimport set_lex_struct_props
 from .lexeme cimport Lexeme_cinit
 from .strings cimport slice_unicode
 from .strings cimport hash_string
 
@@ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.vec = EMPTY_VEC
 
 
-cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
-                         StringStore string_store, dict props) except *:
-    cdef LexemeC lex
-    lex.id = i
-    lex.length = len(string)
-    lex.sic = string_store[string]
-
-    lex.cluster = props.get('cluster', 0)
-    lex.prob = props.get('prob', 0)
-
-    lex.prefix = string_store[string[:1]]
-    lex.suffix = string_store[string[-3:]]
-    lex.shape = string_store[word_shape(string)]
-
-    lex.flags = props.get('flags', 0)
-    return lex
-
-
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
 
@@ -47,7 +29,7 @@ cdef class Vocab:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.get_lex_props = get_lex_props
+        self.lexeme_props_getter = get_lex_props
 
         if data_dir is not None:
             if not path.exists(data_dir):
 
@@ -63,32 +45,36 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.lexemes.size()
 
-    cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
+    cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
         is the lexicon's own memory, the lexeme is saved in the lexicon.'''
         cdef LexemeC* lex
-        lex = <LexemeC*>self._map.get(string.key)
+        lex = <LexemeC*>self._map.get(c_str.key)
         if lex != NULL:
             return lex
-        if string.n < 3:
+        if c_str.n < 3:
             mem = self.mem
-        cdef unicode py_string = string.chars[:string.n]
+        cdef unicode py_str = c_str.chars[:c_str.n]
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
-                             self.get_lex_props(py_string))
+        props = self.lexeme_props_getter(py_str)
+        set_lex_struct_props(lex, props, self.strings)
         if mem is self.mem:
-            self._map.set(string.key, lex)
-            while self.lexemes.size() < (lex.id + 1):
-                self.lexemes.push_back(&EMPTY_LEXEME)
-            self.lexemes[lex.id] = lex
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
         else:
-            lex[0].id = 1
+            lex.id = 1
         return lex
 
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+        self._map.set(key, <void*>lex)
+        while self.lexemes.size() < (lex.id + 1):
+            self.lexemes.push_back(&EMPTY_LEXEME)
+        self.lexemes[lex.id] = lex
+
     def __getitem__(self, id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
-        unseen unicode string is given, a new LexemeC is created and stored.
+        unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
             id_or_string (int or unicode): The integer ID of a word, or its unicode
 
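The new _add_lex_to_vocab helper factors out one invariant: the dense id -> lexeme table is padded with the empty sentinel until the new id fits, alongside the key -> lexeme hash map. A pure-Python sketch with stand-ins for the C structures:

    EMPTY = object()                       # stand-in for &EMPTY_LEXEME

    def add_lex_to_vocab(by_key, lexemes, key, lex_id, lex):
        by_key[key] = lex                  # hash map: hashed string -> lexeme
        while len(lexemes) < lex_id + 1:   # grow the dense table as needed
            lexemes.append(EMPTY)
        lexemes[lex_id] = lex              # dense table: id -> lexeme

    by_key, lexemes = {}, [EMPTY]          # slot 0 holds the empty lexeme
    add_lex_to_vocab(by_key, lexemes, key=0xbeef, lex_id=3, lex={'sic': 1})
    assert lexemes[3] == {'sic': 1} and lexemes[2] is EMPTY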
@@ -100,24 +86,28 @@ cdef class Vocab:
             lexeme (Lexeme): An instance of the Lexeme Python class, with data
                 copied on instantiation.
         '''
-        cdef UniStr string
+        cdef UniStr c_str
         cdef const LexemeC* lexeme
         if type(id_or_string) == int:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
         else:
-            slice_unicode(&string, id_or_string, 0, len(id_or_string))
-            lexeme = self.get(self.mem, &string)
+            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
+            lexeme = self.get(self.mem, &c_str)
         return Lexeme_cinit(lexeme, self.strings)
 
-    def __setitem__(self, unicode uni_string, dict props):
-        cdef UniStr s
-        slice_unicode(&s, uni_string, 0, len(uni_string))
-        # Cast through the const here, since we're allowed to change our own
-        # LexemeCs.
-        lex = <LexemeC*><void*>self.get(self.mem, &s)
-        lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
+    def __setitem__(self, unicode py_str, dict props):
+        cdef UniStr c_str
+        slice_unicode(&c_str, py_str, 0, len(py_str))
+        cdef LexemeC* lex
+        lex = <LexemeC*>self._map.get(c_str.key)
+        if lex == NULL:
+            lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
+            lex.id = self.lexemes.size()
+            self._add_lex_to_vocab(c_str.key, lex)
+        set_lex_struct_props(lex, props, self.strings)
+        assert lex.sic < 1000000
 
     def dump(self, loc):
         if path.exists(loc):
 
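__setitem__ now has insert-or-update semantics: look the key up first, allocate and register only on a miss, then write the props in place either way. The control flow, reduced to a dict-based sketch (names hypothetical):

    def set_item(by_key, lexemes, key, props):
        lex = by_key.get(key)
        if lex is None:                    # unseen string: allocate + register
            lex = {'id': len(lexemes)}
            by_key[key] = lex
            lexemes.append(lex)
        lex.update(props)                  # existing entries updated in place

    by_key, lexemes = {}, []
    set_item(by_key, lexemes, 0xcafe, {'prob': -7.5})
    set_item(by_key, lexemes, 0xcafe, {'prob': -7.0})
    assert len(lexemes) == 1 and lexemes[0]['prob'] == -7.0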
@@ -154,6 +144,7 @@ cdef class Vocab:
             if st != 1:
                 break
             lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
+            lexeme.vec = EMPTY_VEC
             st = fread(lexeme, sizeof(LexemeC), 1, fp)
             if st != 1:
                 break