* Tmp commit. Refactoring to create a Python Lexeme class.

This commit is contained in:
Matthew Honnibal 2015-01-12 10:26:22 +11:00
parent 61904e590f
commit ce2edd6312
16 changed files with 281 additions and 173 deletions

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals
from os import path
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
@ -10,12 +11,10 @@ from .pos import POS_TAGS
from .attrs import get_flags
DATA_DIR = path.join(path.dirname(__file__), 'data')
def get_lex_props(string):
    """Assemble the lexical property dict for a word string.

    The result has two keys: 'flags', taken from get_flags(string), and
    'dense', which is always 1.
    """
    props = {}
    props['flags'] = get_flags(string)
    props['dense'] = 1
    return props
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
class English(object):
"""The English NLP pipeline.
@ -44,16 +43,18 @@ class English(object):
parser (spacy.syntax.parser.GreedyParser):
A greedy shift-reduce dependency parser.
"""
def __init__(self, data_dir=None):
if data_dir is None:
data_dir = path.join(path.dirname(__file__), 'data')
def __init__(self, data_dir=LOCAL_DATA_DIR):
self._data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys())
tag_names.sort()
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
POS_TAGS, tag_names)
if data_dir is None:
self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
POS_TAGS, tag_names)
else:
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
POS_TAGS, tag_names)
self.strings = self.vocab.strings
self._tagger = None
self._parser = None

View File

@ -4,7 +4,7 @@ import tarfile
import shutil
import requests
URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')

View File

@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from .._ml cimport Model
from ..strings cimport StringStore
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..typedefs cimport univ_tag_t
from .lemmatizer import Lemmatizer
@ -21,5 +21,5 @@ cdef class EnPosTagger:
cdef readonly int n_tags
cdef int set_morph(self, const int i, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1

View File

@ -12,7 +12,7 @@ from ..typedefs cimport univ_tag_t
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..typedefs cimport X, PUNCT, EOL
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, Lexeme
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Tokens
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
@ -290,7 +290,7 @@ cdef class EnPosTagger:
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None:
return lex.sic
cdef bytes py_string = self.strings[lex.sic]

View File

@ -1,21 +1,21 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
from .structs cimport Lexeme
from .structs cimport LexemeC
from .strings cimport StringStore
cdef Lexeme EMPTY_LEXEME
cdef LexemeC EMPTY_LEXEME
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
dict props) except *
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:

View File

@ -7,12 +7,12 @@ from libc.string cimport memset
from .orth cimport word_shape
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef Lexeme lex
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
lex.flags = props.get('flags', 0)
return lex

View File

@ -1,4 +1,4 @@
from .structs cimport TokenC, Lexeme, Morphology, PosTag
from .structs cimport TokenC, Morphology, PosTag
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

View File

@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap
from murmurhash.mrmr cimport hash64
from .structs cimport Utf8Str, UniStr
from .typedefs cimport hash_t
cpdef hash_t hash_string(unicode string) except 0
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:

View File

@ -1,6 +1,7 @@
import codecs
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash64
from .typedefs cimport hash_t
@ -9,6 +10,11 @@ from .typedefs cimport hash_t
SEPARATOR = '\n|-SEP-|\n'
cpdef hash_t hash_string(unicode string) except 0:
    """Hash the Py_UNICODE buffer of a unicode string with hash64, seed 0."""
    cdef Py_UNICODE* buf = <Py_UNICODE*>string
    # Size of the buffer in bytes: one Py_UNICODE unit per element of the string.
    cdef size_t n_bytes = len(string) * sizeof(Py_UNICODE)
    return hash64(buf, n_bytes, 0)
"""
cdef class SymbolMap:
def __init__(self):

View File

@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
cdef struct Lexeme:
cdef struct LexemeC:
const float* vec
flags_t flags
attr_t id
@ -38,7 +40,7 @@ cdef struct PosTag:
cdef struct TokenC:
const Lexeme* lex
const LexemeC* lex
Morphology morph
univ_tag_t pos
int fine_pos

View File

@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport Lexeme, TokenC, Morphology, UniStr
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .strings cimport StringStore
from .tokens cimport Tokens
from .vocab cimport Vocab, _Cached
cdef union LexemesOrTokens:
const Lexeme* const* lexemes
const LexemeC* const* lexemes
TokenC* tokens
@ -33,10 +33,10 @@ cdef class Tokenizer:
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1

View File

@ -53,7 +53,7 @@ cdef class Tokenizer:
cdef int idx = 0
for i, py_string in enumerate(strings):
slice_unicode(&string_struct, py_string, 0, len(py_string))
tokens.push_back(idx, <const Lexeme*>self.vocab.get(tokens.mem, &string_struct))
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
idx += len(py_string) + 1
return tokens
@ -75,7 +75,7 @@ cdef class Tokenizer:
string (unicode): The string to be tokenized.
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(self.vocab, length)
@ -121,8 +121,8 @@ cdef class Tokenizer:
return True
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef hash_t orig_key
cdef int orig_size
orig_key = span.key
@ -131,8 +131,8 @@ cdef class Tokenizer:
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except NULL:
cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except NULL:
cdef size_t i
cdef UniStr prefix
cdef UniStr suffix
@ -174,12 +174,12 @@ cdef class Tokenizer:
return string
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except -1:
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
cdef int split
cdef const Lexeme* const* lexemes
cdef Lexeme* lexeme
cdef const LexemeC* const* lexemes
cdef LexemeC* lexeme
cdef UniStr span
cdef int i
if prefixes.size():
@ -200,7 +200,7 @@ cdef class Tokenizer:
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
slice_unicode(&span, string.chars, split + 1, string.n)
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
idx = tokens.push_back(idx, deref(it))
preinc(it)
@ -213,10 +213,10 @@ cdef class Tokenizer:
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = n
cached.is_lex = True
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
for i in range(n):
lexemes[i] = tokens[i].lex
cached.data.lexemes = <const Lexeme* const*>lexemes
cached.data.lexemes = <const LexemeC* const*>lexemes
self._cache.set(key, cached)
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
@ -243,7 +243,7 @@ cdef class Tokenizer:
cdef unicode form
cdef unicode lemma
cdef dict props
cdef Lexeme** lexemes
cdef LexemeC** lexemes
cdef hash_t hashed
cdef UniStr string
for chunk, substrings in sorted(rules.items()):
@ -252,7 +252,7 @@ cdef class Tokenizer:
form = props['F']
lemma = props.get("L", None)
slice_unicode(&string, form, 0, len(form))
tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
if lemma:
tokens[i].lemma = self.vocab.strings[lemma]
if 'pos' in props:

View File

@ -5,13 +5,13 @@ from cython.view cimport array as cvarray
from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .typedefs cimport flags_t, attr_id_t, attr_t
from .structs cimport Morphology, TokenC, Lexeme
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab
from .strings cimport StringStore
ctypedef const Lexeme* const_Lexeme_ptr
ctypedef const LexemeC* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
ctypedef fused LexemeOrToken:
@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
TokenC_ptr
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
@ -42,5 +42,32 @@ cdef class Tokens:
cdef class Token:
# Attribute declarations for the Python-level Token extension type.
# NOTE(review): _seq and i are not assigned by cinit_token in this commit
# (the __init__ that set them is commented out) -- confirm they are still needed.
cdef Tokens _seq
cdef readonly int i
# Array object assigned in cinit_token from a freshly created cvarray.
cdef cvarray vec
# The fields below are assigned in cinit_token from the token's LexemeC (c_tok.lex).
cdef readonly flags_t flags
cdef readonly attr_t id
cdef readonly attr_t sic
cdef readonly attr_t dense
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly attr_t length
cdef readonly attr_t cluster
cdef readonly attr_t pos_type
cdef readonly float prob
cdef readonly float sentiment
# The fields below are assigned in cinit_token from the TokenC struct itself.
cdef readonly Morphology morph
cdef readonly univ_tag_t pos
cdef readonly int fine_pos
cdef readonly int idx
cdef readonly int lemma
cdef readonly int sense
cdef readonly int dep_tag
# Assigned from c_tok.head in cinit_token.
cdef readonly int head_offset
cdef readonly uint32_t l_kids
cdef readonly uint32_t r_kids

View File

@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
return get_lex_attr(token.lex, feat_name)
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
if feat_name < (sizeof(flags_t) * 8):
return check_flag(lex, feat_name)
elif feat_name == ID:
@ -85,7 +85,7 @@ cdef class Tokens:
token (Token):
"""
bounds_check(i, self.length, PADDING)
return Token(self, i)
return cinit_token(&self.data[i])
def __iter__(self):
"""Iterate over the tokens.
@ -174,26 +174,57 @@ cdef class Tokens:
self.data[i].lex = &EMPTY_LEXEME
@cython.freelist(64)
cdef Token cinit_token(const TokenC* c_tok):
    """Build a Python-level Token by copying fields out of a C TokenC struct.

    Every attribute is assigned by value, either from ``c_tok`` itself or
    from the LexemeC it points to (``c_tok.lex``).

    Args:
        c_tok (const TokenC*): The token struct to copy from. Its ``lex``
            pointer is dereferenced, so it must not be NULL.

    Returns:
        py_tok (Token): A new Token created via ``Token.__new__``, so
            ``Token.__init__`` is not run.
    """
    cdef const LexemeC* lex = c_tok.lex
    cdef Token py_tok = Token.__new__(Token)
    # The buffer holds C floats (itemsize is sizeof(float) and LexemeC.vec is
    # a ``const float*``), so the struct format code must be "f", not "i".
    # NOTE(review): the buffer is only allocated here -- nothing copies
    # lex.vec into it, and the length 300 is hard-coded. Confirm both.
    cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="f")
    py_tok.vec = cyarr

    # Lexical (context-independent) attributes, taken from the LexemeC.
    py_tok.flags = lex.flags
    py_tok.id = lex.id
    py_tok.sic = lex.sic
    py_tok.dense = lex.dense
    py_tok.shape = lex.shape
    py_tok.prefix = lex.prefix
    py_tok.suffix = lex.suffix
    py_tok.length = lex.length
    py_tok.cluster = lex.cluster
    py_tok.pos_type = lex.pos_type
    py_tok.prob = lex.prob
    py_tok.sentiment = lex.sentiment

    # Per-token attributes, taken from the TokenC itself.
    py_tok.morph = c_tok.morph
    py_tok.pos = c_tok.pos
    py_tok.fine_pos = c_tok.fine_pos
    py_tok.idx = c_tok.idx
    py_tok.lemma = c_tok.lemma
    py_tok.sense = c_tok.sense
    py_tok.dep_tag = c_tok.dep_tag
    # The struct field is named ``head``; the Token exposes it as head_offset.
    py_tok.head_offset = c_tok.head
    py_tok.l_kids = c_tok.l_kids
    py_tok.r_kids = c_tok.r_kids
    return py_tok
cdef class Token:
"""An individual token.
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
object.
"""
def __init__(self, Tokens tokens, int i):
self._seq = tokens
self.i = i
def __init__(self):
pass
#self._seq = tokens
#self.i = i
def __unicode__(self):
cdef const TokenC* t = &self._seq.data[self.i]
cdef int end_idx = t.idx + t.lex.length
if self.i + 1 == self._seq.length:
return self.string
if end_idx == t[1].idx:
return self.string
else:
return self.string + ' '
#def __unicode__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# cdef int end_idx = t.idx + t.lex.length
# if self.i + 1 == self._seq.length:
# return self.string
# if end_idx == t[1].idx:
# return self.string
# else:
# return self.string + ' '
def __len__(self):
"""The number of unicode code-points in the original string.
@ -201,87 +232,87 @@ cdef class Token:
Returns:
length (int):
"""
return self._seq.data[self.i].lex.length
return self.length
property idx:
"""The index into the original string at which the token starts.
#property idx:
# """The index into the original string at which the token starts.
The following is supposed to always be true:
# The following is supposed to always be true:
#
#     >>> original_string[token.idx:token.idx + len(token)] == token.string
# """
# def __get__(self):
# return self._seq.data[self.i].idx
>>> original_string[token.idx:token.idx + len(token)] == token.string
"""
def __get__(self):
return self._seq.data[self.i].idx
#property cluster:
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
#
# Similar words have better-than-chance likelihood of having similar cluster
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
# and help to make models slightly more robust to domain variation.
property cluster:
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
# A common trick is to use only the first N bits of a cluster ID in a feature,
# as the more general part of the hierarchical clustering is often more accurate
# than the lower categories.
Similar words have better-than-chance likelihood of having similar cluster
IDs, although the clustering is quite noisy. Cluster IDs make good features,
and help to make models slightly more robust to domain variation.
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
# bit-mask:
A common trick is to use only the first N bits of a cluster ID in a feature,
as the more general part of the hierarchical clustering is often more accurate
than the lower categories.
# >>> six_bits = cluster & (2**6 - 1)
# """
# def __get__(self):
# return self._seq.data[self.i].lex.cluster
To assist in this, I encode the cluster IDs little-endian, to allow a simple
bit-mask:
#property string:
# """The unicode string of the word, with no whitespace padding."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lex.sic == 0:
# return ''
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
# return utf8string.decode('utf8')
>>> six_bits = cluster & (2**6 - 1)
"""
def __get__(self):
return self._seq.data[self.i].lex.cluster
#property lemma:
# """The unicode string of the word's lemma. If no part-of-speech tag is
# assigned, the most common part-of-speech tag of the word is used.
# """
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# if t.lemma == 0:
# return self.string
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
# return utf8string.decode('utf8')
property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.sic == 0:
return ''
cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
return utf8string.decode('utf8')
#property dep_tag:
# """The ID integer of the word's dependency label. If no parse has been
# assigned, defaults to 0.
# """
# def __get__(self):
# return self._seq.data[self.i].dep_tag
property lemma:
"""The unicode string of the word's lemma. If no part-of-speech tag is
assigned, the most common part-of-speech tag of the word is used.
"""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lemma == 0:
return self.string
cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
return utf8string.decode('utf8')
#property pos:
# """The ID integer of the word's part-of-speech tag, from the 13-tag
# Google Universal Tag Set. Constants for this tag set are available in
# spacy.typedefs.
# """
# def __get__(self):
# return self._seq.data[self.i].pos
property dep_tag:
"""The ID integer of the word's dependency label. If no parse has been
assigned, defaults to 0.
"""
def __get__(self):
return self._seq.data[self.i].dep_tag
#property fine_pos:
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
# by the tagger model. Fine-grained tags include morphological information,
# and other distinctions, and allow a more accurate tagger to be trained.
# """
property pos:
"""The ID integer of the word's part-of-speech tag, from the 13-tag
Google Universal Tag Set. Constants for this tag set are available in
spacy.typedefs.
"""
def __get__(self):
return self._seq.data[self.i].pos
# def __get__(self):
# return self._seq.data[self.i].fine_pos
property fine_pos:
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
by the tagger model. Fine-grained tags include morphological information,
and other distinctions, and allow a more accurate tagger to be trained.
"""
#property sic:
# def __get__(self):
# return self._seq.data[self.i].lex.sic
def __get__(self):
return self._seq.data[self.i].fine_pos
property sic:
def __get__(self):
return self._seq.data[self.i].lex.sic
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head)
#property head:
# """The token predicted by the parser to be the head of the current token."""
# def __get__(self):
# cdef const TokenC* t = &self._seq.data[self.i]
# return Token(self._seq, self.i + t.head)

View File

@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport Lexeme, TokenC, UniStr
from .structs cimport LexemeC, TokenC, UniStr
from .typedefs cimport utf8_t, id_t, hash_t
from .strings cimport StringStore
cdef Lexeme EMPTY_LEXEME
cdef LexemeC EMPTY_LEXEME
cdef union LexemesOrTokens:
const Lexeme* const* lexemes
const LexemeC* const* lexemes
TokenC* tokens
@ -27,9 +27,9 @@ cdef class Vocab:
cpdef public get_lex_props
cdef Pool mem
cpdef readonly StringStore strings
cdef vector[Lexeme*] lexemes
cdef vector[LexemeC*] lexemes
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef PreshMap _map

View File

@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from os import path
import codecs
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport init as lexeme_init
from .strings cimport slice_unicode
from .typedefs cimport flags_t
from .strings cimport hash_string
from .orth cimport word_shape
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
DEF MAX_VEC_SIZE = 100000
cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vec = EMPTY_VEC
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
StringStore string_store, dict props) except *:
cdef Lexeme lex
cdef LexemeC lex
lex.id = i
lex.length = len(string)
lex.sic = string_store[string]
@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
lex.suffix = string_store[string[-3:]]
lex.shape = string_store[word_shape(string)]
cdef object flags_val = props.get('flags', 0)
lex.flags = <flags_t>flags_val
lex.flags = props.get('flags', 0)
return lex
cdef class Vocab:
'''A map container for a language's Lexeme structs.
'''A map container for a language's LexemeC structs.
'''
def __init__(self, data_dir=None, get_lex_props=None):
self.mem = Pool()
@ -50,24 +56,25 @@ cdef class Vocab:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.strings.load(path.join(data_dir, 'strings.txt'))
self.load(path.join(data_dir, 'lexemes.bin'))
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
#self.load_vectors(path.join(data_dir, 'deps.words'))
def __len__(self):
"""The current number of lexemes stored."""
return self.lexemes.size()
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef Lexeme* lex
lex = <Lexeme*>self._map.get(string.key)
cdef LexemeC* lex
lex = <LexemeC*>self._map.get(string.key)
if lex != NULL:
return lex
if string.n < 3:
mem = self.mem
cdef unicode py_string = string.chars[:string.n]
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
self.get_lex_props(py_string))
if mem is self.mem:
@ -81,13 +88,13 @@ cdef class Vocab:
def __getitem__(self, id_or_string):
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new Lexeme is created and stored.
unseen unicode string is given, a new LexemeC is created and stored.
This function relies on Cython's struct-to-dict conversion. Python clients
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
with int values. Cython clients can instead receive a Lexeme struct value.
with int values. Cython clients can instead receive a LexemeC struct value.
More efficient Cython access is provided by Vocab.get, which returns
a Lexeme*.
a LexemeC*.
Args:
id_or_string (int or unicode): The integer ID of a word, or its unicode
@ -96,24 +103,26 @@ cdef class Vocab:
is raised.
Returns:
lexeme (dict): A Lexeme struct instance, which Cython translates into
lexeme (dict): A LexemeC struct instance, which Cython translates into
a dict if the operator is called from Python.
'''
if type(id_or_string) == int:
if id_or_string >= self.lexemes.size():
raise IndexError
return self.lexemes.at(id_or_string)[0]
return {}
#return self.lexemes.at(id_or_string)[0]
cdef UniStr string
slice_unicode(&string, id_or_string, 0, len(id_or_string))
cdef const Lexeme* lexeme = self.get(self.mem, &string)
return lexeme[0]
cdef const LexemeC* lexeme = self.get(self.mem, &string)
return {}
#return lexeme[0]
def __setitem__(self, unicode uni_string, dict props):
cdef UniStr s
slice_unicode(&s, uni_string, 0, len(uni_string))
# Cast through the const here, since we're allowed to change our own
# Lexemes.
lex = <Lexeme*><void*>self.get(self.mem, &s)
# LexemeCs.
lex = <LexemeC*><void*>self.get(self.mem, &s)
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
def dump(self, loc):
@ -128,30 +137,30 @@ cdef class Vocab:
key = self._map.c_map.cells[i].key
if key == 0:
continue
lexeme = <Lexeme*>self._map.c_map.cells[i].value
lexeme = <LexemeC*>self._map.c_map.cells[i].value
st = fwrite(&key, sizeof(key), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
def load(self, loc):
def load_lexemes(self, loc):
if not path.exists(loc):
raise IOError('Lexemes file not found at %s' % loc)
raise IOError('LexemeCs file not found at %s' % loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
assert fp != NULL
cdef size_t st
cdef Lexeme* lexeme
cdef LexemeC* lexeme
cdef hash_t key
i = 0
while True:
st = fread(&key, sizeof(key), 1, fp)
if st != 1:
break
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
st = fread(lexeme, sizeof(Lexeme), 1, fp)
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
st = fread(lexeme, sizeof(LexemeC), 1, fp)
if st != 1:
break
self._map.set(key, lexeme)
@ -160,3 +169,29 @@ cdef class Vocab:
self.lexemes[lexeme.id] = lexeme
i += 1
fclose(fp)
def load_vectors(self, loc):
    """Read word vectors from a text file and attach them to known lexemes.

    Each non-blank line is split on whitespace: the first piece is the
    word and the remaining pieces are parsed as floats. The values are
    written to an array allocated from ``self.mem`` and stored on the
    lexeme's ``vec`` field. Words with no entry in ``self._map`` are
    ignored.

    Args:
        loc: Path of the vectors file; it is opened as UTF-8 text.

    Raises:
        ValueError: If a line holds MAX_VEC_SIZE or more values.
    """
    cdef int i
    cdef unicode line
    cdef unicode word
    cdef unicode val_str
    cdef hash_t key
    cdef LexemeC* lex
    cdef float* vec
    with codecs.open(loc, 'r', 'utf8') as file_:
        for line in file_:
            pieces = line.split()
            if not pieces:
                # Blank line: there is no word to pop, so skip it instead
                # of failing with IndexError.
                continue
            word = pieces.pop(0)
            if len(pieces) >= MAX_VEC_SIZE:
                sizes = (len(pieces), MAX_VEC_SIZE)
                # Adjacent literals are concatenated verbatim, so the
                # separating space has to be written explicitly.
                msg = ("Your vector is %d elements. "
                       "The compile-time limit is %d elements." % sizes)
                raise ValueError(msg)
            key = hash_string(word)
            lex = <LexemeC*>self._map.get(key)
            # Only words already present in the map receive a vector.
            if lex != NULL:
                vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
                for i, val_str in enumerate(pieces):
                    vec[i] = float(val_str)
                lex.vec = vec