mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class.
This commit is contained in:
parent
61904e590f
commit
ce2edd6312
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
from .. import orth
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
from ..tokenizer import Tokenizer
|
from ..tokenizer import Tokenizer
|
||||||
from ..syntax.parser import GreedyParser
|
from ..syntax.parser import GreedyParser
|
||||||
|
@ -10,12 +11,10 @@ from .pos import POS_TAGS
|
||||||
from .attrs import get_flags
|
from .attrs import get_flags
|
||||||
|
|
||||||
|
|
||||||
DATA_DIR = path.join(path.dirname(__file__), 'data')
|
|
||||||
|
|
||||||
|
|
||||||
def get_lex_props(string):
|
def get_lex_props(string):
|
||||||
return {'flags': get_flags(string), 'dense': 1}
|
return {'flags': get_flags(string), 'dense': 1}
|
||||||
|
|
||||||
|
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||||
|
|
||||||
class English(object):
|
class English(object):
|
||||||
"""The English NLP pipeline.
|
"""The English NLP pipeline.
|
||||||
|
@ -44,16 +43,18 @@ class English(object):
|
||||||
parser (spacy.syntax.parser.GreedyParser):
|
parser (spacy.syntax.parser.GreedyParser):
|
||||||
A greedy shift-reduce dependency parser.
|
A greedy shift-reduce dependency parser.
|
||||||
"""
|
"""
|
||||||
def __init__(self, data_dir=None):
|
def __init__(self, data_dir=LOCAL_DATA_DIR):
|
||||||
if data_dir is None:
|
|
||||||
data_dir = path.join(path.dirname(__file__), 'data')
|
|
||||||
self._data_dir = data_dir
|
self._data_dir = data_dir
|
||||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
|
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
|
||||||
get_lex_props=get_lex_props)
|
get_lex_props=get_lex_props)
|
||||||
tag_names = list(POS_TAGS.keys())
|
tag_names = list(POS_TAGS.keys())
|
||||||
tag_names.sort()
|
tag_names.sort()
|
||||||
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
|
if data_dir is None:
|
||||||
POS_TAGS, tag_names)
|
self.tokenizer = Tokenizer(self.vocab, {}, None, None, None,
|
||||||
|
POS_TAGS, tag_names)
|
||||||
|
else:
|
||||||
|
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
|
||||||
|
POS_TAGS, tag_names)
|
||||||
self.strings = self.vocab.strings
|
self.strings = self.vocab.strings
|
||||||
self._tagger = None
|
self._tagger = None
|
||||||
self._parser = None
|
self._parser = None
|
||||||
|
|
|
@ -4,7 +4,7 @@ import tarfile
|
||||||
import shutil
|
import shutil
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
|
PARSER_URL = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com/en.tgz'
|
||||||
|
|
||||||
DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')
|
DEST_DIR = path.join(path.dirname(__file__), 'data', 'deps')
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .._ml cimport Model
|
from .._ml cimport Model
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
|
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
|
||||||
from ..typedefs cimport univ_tag_t
|
from ..typedefs cimport univ_tag_t
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
@ -21,5 +21,5 @@ cdef class EnPosTagger:
|
||||||
cdef readonly int n_tags
|
cdef readonly int n_tags
|
||||||
|
|
||||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ from ..typedefs cimport univ_tag_t
|
||||||
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||||
from ..typedefs cimport X, PUNCT, EOL
|
from ..typedefs cimport X, PUNCT, EOL
|
||||||
from ..typedefs cimport id_t
|
from ..typedefs cimport id_t
|
||||||
from ..structs cimport TokenC, Morphology, Lexeme
|
from ..structs cimport TokenC, Morphology, LexemeC
|
||||||
from ..tokens cimport Tokens
|
from ..tokens cimport Tokens
|
||||||
from ..morphology cimport set_morph_from_dict
|
from ..morphology cimport set_morph_from_dict
|
||||||
from .._ml cimport arg_max
|
from .._ml cimport arg_max
|
||||||
|
@ -290,7 +290,7 @@ cdef class EnPosTagger:
|
||||||
tokens[i].lemma = cached.lemma
|
tokens[i].lemma = cached.lemma
|
||||||
tokens[i].morph = cached.morph
|
tokens[i].morph = cached.morph
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return lex.sic
|
return lex.sic
|
||||||
cdef bytes py_string = self.strings[lex.sic]
|
cdef bytes py_string = self.strings[lex.sic]
|
||||||
|
|
|
@ -1,21 +1,21 @@
|
||||||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||||
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
|
from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
|
||||||
from .structs cimport Lexeme
|
from .structs cimport LexemeC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef LexemeC EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed, StringStore store,
|
cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store,
|
||||||
dict props) except *
|
dict props) except *
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
if feat_name < (sizeof(flags_t) * 8):
|
if feat_name < (sizeof(flags_t) * 8):
|
||||||
return check_flag(lex, feat_name)
|
return check_flag(lex, feat_name)
|
||||||
elif feat_name == ID:
|
elif feat_name == ID:
|
||||||
|
|
|
@ -7,12 +7,12 @@ from libc.string cimport memset
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
|
|
||||||
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
cdef LexemeC init(id_t i, unicode string, hash_t hashed,
|
||||||
StringStore string_store, dict props) except *:
|
StringStore string_store, dict props) except *:
|
||||||
cdef Lexeme lex
|
cdef LexemeC lex
|
||||||
lex.id = i
|
lex.id = i
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.sic = string_store[string]
|
lex.sic = string_store[string]
|
||||||
|
@ -27,3 +27,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
|
||||||
|
|
||||||
lex.flags = props.get('flags', 0)
|
lex.flags = props.get('flags', 0)
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
from .structs cimport TokenC, Morphology, PosTag
|
||||||
|
|
||||||
|
|
||||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
||||||
|
|
|
@ -3,6 +3,9 @@ from preshed.maps cimport PreshMap
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .structs cimport Utf8Str, UniStr
|
from .structs cimport Utf8Str, UniStr
|
||||||
|
from .typedefs cimport hash_t
|
||||||
|
|
||||||
|
cpdef hash_t hash_string(unicode string) except 0
|
||||||
|
|
||||||
|
|
||||||
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) nogil:
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import codecs
|
import codecs
|
||||||
|
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
|
@ -9,6 +10,11 @@ from .typedefs cimport hash_t
|
||||||
SEPARATOR = '\n|-SEP-|\n'
|
SEPARATOR = '\n|-SEP-|\n'
|
||||||
|
|
||||||
|
|
||||||
|
cpdef hash_t hash_string(unicode string) except 0:
|
||||||
|
chars = <Py_UNICODE*>string
|
||||||
|
return hash64(chars, len(string) * sizeof(Py_UNICODE), 0)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
cdef class SymbolMap:
|
cdef class SymbolMap:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
|
@ -3,7 +3,9 @@ from libc.stdint cimport uint8_t, uint32_t
|
||||||
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct Lexeme:
|
cdef struct LexemeC:
|
||||||
|
const float* vec
|
||||||
|
|
||||||
flags_t flags
|
flags_t flags
|
||||||
|
|
||||||
attr_t id
|
attr_t id
|
||||||
|
@ -38,7 +40,7 @@ cdef struct PosTag:
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
const Lexeme* lex
|
const LexemeC* lex
|
||||||
Morphology morph
|
Morphology morph
|
||||||
univ_tag_t pos
|
univ_tag_t pos
|
||||||
int fine_pos
|
int fine_pos
|
||||||
|
|
|
@ -6,14 +6,14 @@ from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from .structs cimport Lexeme, TokenC, Morphology, UniStr
|
from .structs cimport LexemeC, TokenC, Morphology, UniStr
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens
|
||||||
from .vocab cimport Vocab, _Cached
|
from .vocab cimport Vocab, _Cached
|
||||||
|
|
||||||
|
|
||||||
cdef union LexemesOrTokens:
|
cdef union LexemesOrTokens:
|
||||||
const Lexeme* const* lexemes
|
const LexemeC* const* lexemes
|
||||||
TokenC* tokens
|
TokenC* tokens
|
||||||
|
|
||||||
|
|
||||||
|
@ -33,10 +33,10 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
|
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
|
||||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
|
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
|
||||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
|
||||||
vector[Lexeme*] *suffixes) except NULL
|
vector[LexemeC*] *suffixes) except NULL
|
||||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||||
vector[Lexeme*] *prefixes, vector[Lexeme*] *suffixes) except -1
|
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
|
||||||
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
|
||||||
|
|
|
@ -53,7 +53,7 @@ cdef class Tokenizer:
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
for i, py_string in enumerate(strings):
|
for i, py_string in enumerate(strings):
|
||||||
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
slice_unicode(&string_struct, py_string, 0, len(py_string))
|
||||||
tokens.push_back(idx, <const Lexeme*>self.vocab.get(tokens.mem, &string_struct))
|
tokens.push_back(idx, <const LexemeC*>self.vocab.get(tokens.mem, &string_struct))
|
||||||
idx += len(py_string) + 1
|
idx += len(py_string) + 1
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
@ -75,7 +75,7 @@ cdef class Tokenizer:
|
||||||
string (unicode): The string to be tokenized.
|
string (unicode): The string to be tokenized.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
|
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
|
||||||
"""
|
"""
|
||||||
cdef int length = len(string)
|
cdef int length = len(string)
|
||||||
cdef Tokens tokens = Tokens(self.vocab, length)
|
cdef Tokens tokens = Tokens(self.vocab, length)
|
||||||
|
@ -121,8 +121,8 @@ cdef class Tokenizer:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
|
||||||
cdef vector[Lexeme*] prefixes
|
cdef vector[LexemeC*] prefixes
|
||||||
cdef vector[Lexeme*] suffixes
|
cdef vector[LexemeC*] suffixes
|
||||||
cdef hash_t orig_key
|
cdef hash_t orig_key
|
||||||
cdef int orig_size
|
cdef int orig_size
|
||||||
orig_key = span.key
|
orig_key = span.key
|
||||||
|
@ -131,8 +131,8 @@ cdef class Tokenizer:
|
||||||
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
self._attach_tokens(tokens, start, span, &prefixes, &suffixes)
|
||||||
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size)
|
||||||
|
|
||||||
cdef UniStr* _split_affixes(self, UniStr* string, vector[const Lexeme*] *prefixes,
|
cdef UniStr* _split_affixes(self, UniStr* string, vector[const LexemeC*] *prefixes,
|
||||||
vector[const Lexeme*] *suffixes) except NULL:
|
vector[const LexemeC*] *suffixes) except NULL:
|
||||||
cdef size_t i
|
cdef size_t i
|
||||||
cdef UniStr prefix
|
cdef UniStr prefix
|
||||||
cdef UniStr suffix
|
cdef UniStr suffix
|
||||||
|
@ -174,12 +174,12 @@ cdef class Tokenizer:
|
||||||
return string
|
return string
|
||||||
|
|
||||||
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
|
||||||
vector[const Lexeme*] *prefixes,
|
vector[const LexemeC*] *prefixes,
|
||||||
vector[const Lexeme*] *suffixes) except -1:
|
vector[const LexemeC*] *suffixes) except -1:
|
||||||
cdef bint cache_hit
|
cdef bint cache_hit
|
||||||
cdef int split
|
cdef int split
|
||||||
cdef const Lexeme* const* lexemes
|
cdef const LexemeC* const* lexemes
|
||||||
cdef Lexeme* lexeme
|
cdef LexemeC* lexeme
|
||||||
cdef UniStr span
|
cdef UniStr span
|
||||||
cdef int i
|
cdef int i
|
||||||
if prefixes.size():
|
if prefixes.size():
|
||||||
|
@ -200,7 +200,7 @@ cdef class Tokenizer:
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
||||||
slice_unicode(&span, string.chars, split + 1, string.n)
|
slice_unicode(&span, string.chars, split + 1, string.n)
|
||||||
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
idx = tokens.push_back(idx, self.vocab.get(tokens.mem, &span))
|
||||||
cdef vector[const Lexeme*].reverse_iterator it = suffixes.rbegin()
|
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
|
||||||
while it != suffixes.rend():
|
while it != suffixes.rend():
|
||||||
idx = tokens.push_back(idx, deref(it))
|
idx = tokens.push_back(idx, deref(it))
|
||||||
preinc(it)
|
preinc(it)
|
||||||
|
@ -213,10 +213,10 @@ cdef class Tokenizer:
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
cached.length = n
|
cached.length = n
|
||||||
cached.is_lex = True
|
cached.is_lex = True
|
||||||
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
|
lexemes = <const LexemeC**>self.mem.alloc(n, sizeof(LexemeC**))
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
lexemes[i] = tokens[i].lex
|
lexemes[i] = tokens[i].lex
|
||||||
cached.data.lexemes = <const Lexeme* const*>lexemes
|
cached.data.lexemes = <const LexemeC* const*>lexemes
|
||||||
self._cache.set(key, cached)
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||||
|
@ -243,7 +243,7 @@ cdef class Tokenizer:
|
||||||
cdef unicode form
|
cdef unicode form
|
||||||
cdef unicode lemma
|
cdef unicode lemma
|
||||||
cdef dict props
|
cdef dict props
|
||||||
cdef Lexeme** lexemes
|
cdef LexemeC** lexemes
|
||||||
cdef hash_t hashed
|
cdef hash_t hashed
|
||||||
cdef UniStr string
|
cdef UniStr string
|
||||||
for chunk, substrings in sorted(rules.items()):
|
for chunk, substrings in sorted(rules.items()):
|
||||||
|
@ -252,7 +252,7 @@ cdef class Tokenizer:
|
||||||
form = props['F']
|
form = props['F']
|
||||||
lemma = props.get("L", None)
|
lemma = props.get("L", None)
|
||||||
slice_unicode(&string, form, 0, len(form))
|
slice_unicode(&string, form, 0, len(form))
|
||||||
tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
|
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
|
||||||
if lemma:
|
if lemma:
|
||||||
tokens[i].lemma = self.vocab.strings[lemma]
|
tokens[i].lemma = self.vocab.strings[lemma]
|
||||||
if 'pos' in props:
|
if 'pos' in props:
|
||||||
|
|
|
@ -5,13 +5,13 @@ from cython.view cimport array as cvarray
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport atom_t
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
from .typedefs cimport flags_t, attr_id_t, attr_t
|
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
|
||||||
from .structs cimport Morphology, TokenC, Lexeme
|
from .structs cimport Morphology, TokenC, LexemeC
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
|
|
||||||
ctypedef const Lexeme* const_Lexeme_ptr
|
ctypedef const LexemeC* const_Lexeme_ptr
|
||||||
ctypedef TokenC* TokenC_ptr
|
ctypedef TokenC* TokenC_ptr
|
||||||
|
|
||||||
ctypedef fused LexemeOrToken:
|
ctypedef fused LexemeOrToken:
|
||||||
|
@ -19,10 +19,10 @@ ctypedef fused LexemeOrToken:
|
||||||
TokenC_ptr
|
TokenC_ptr
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil
|
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
|
||||||
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
|
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
|
||||||
|
|
||||||
cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil:
|
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||||
return lexeme.flags & (1 << flag_id)
|
return lexeme.flags & (1 << flag_id)
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,5 +42,32 @@ cdef class Tokens:
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef Tokens _seq
|
cdef cvarray vec
|
||||||
cdef readonly int i
|
|
||||||
|
cdef readonly flags_t flags
|
||||||
|
|
||||||
|
cdef readonly attr_t id
|
||||||
|
cdef readonly attr_t sic
|
||||||
|
cdef readonly attr_t dense
|
||||||
|
cdef readonly attr_t shape
|
||||||
|
cdef readonly attr_t prefix
|
||||||
|
cdef readonly attr_t suffix
|
||||||
|
|
||||||
|
cdef readonly attr_t length
|
||||||
|
cdef readonly attr_t cluster
|
||||||
|
cdef readonly attr_t pos_type
|
||||||
|
|
||||||
|
cdef readonly float prob
|
||||||
|
cdef readonly float sentiment
|
||||||
|
|
||||||
|
cdef readonly Morphology morph
|
||||||
|
cdef readonly univ_tag_t pos
|
||||||
|
cdef readonly int fine_pos
|
||||||
|
cdef readonly int idx
|
||||||
|
cdef readonly int lemma
|
||||||
|
cdef readonly int sense
|
||||||
|
cdef readonly int dep_tag
|
||||||
|
|
||||||
|
cdef readonly int head_offset
|
||||||
|
cdef readonly uint32_t l_kids
|
||||||
|
cdef readonly uint32_t r_kids
|
||||||
|
|
207
spacy/tokens.pyx
207
spacy/tokens.pyx
|
@ -32,7 +32,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||||
return get_lex_attr(token.lex, feat_name)
|
return get_lex_attr(token.lex, feat_name)
|
||||||
|
|
||||||
|
|
||||||
cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
|
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
||||||
if feat_name < (sizeof(flags_t) * 8):
|
if feat_name < (sizeof(flags_t) * 8):
|
||||||
return check_flag(lex, feat_name)
|
return check_flag(lex, feat_name)
|
||||||
elif feat_name == ID:
|
elif feat_name == ID:
|
||||||
|
@ -85,7 +85,7 @@ cdef class Tokens:
|
||||||
token (Token):
|
token (Token):
|
||||||
"""
|
"""
|
||||||
bounds_check(i, self.length, PADDING)
|
bounds_check(i, self.length, PADDING)
|
||||||
return Token(self, i)
|
return cinit_token(&self.data[i])
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the tokens.
|
"""Iterate over the tokens.
|
||||||
|
@ -174,26 +174,57 @@ cdef class Tokens:
|
||||||
self.data[i].lex = &EMPTY_LEXEME
|
self.data[i].lex = &EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
@cython.freelist(64)
|
cdef Token cinit_token(const TokenC* c_tok):
|
||||||
|
cdef const LexemeC* lex = c_tok.lex
|
||||||
|
cdef Token py_tok = Token.__new__(Token)
|
||||||
|
|
||||||
|
cyarr = cvarray(shape=(300,), itemsize=sizeof(float), format="i")
|
||||||
|
py_tok.vec = cyarr
|
||||||
|
|
||||||
|
py_tok.flags = lex.flags
|
||||||
|
py_tok.id = lex.id
|
||||||
|
py_tok.sic = lex.sic
|
||||||
|
py_tok.dense = lex.dense
|
||||||
|
py_tok.shape = lex.shape
|
||||||
|
py_tok.prefix = lex.prefix
|
||||||
|
py_tok.suffix = lex.suffix
|
||||||
|
py_tok.length = lex.length
|
||||||
|
py_tok.cluster = lex.cluster
|
||||||
|
py_tok.pos_type = lex.pos_type
|
||||||
|
|
||||||
|
py_tok.prob = lex.prob
|
||||||
|
py_tok.sentiment = lex.sentiment
|
||||||
|
|
||||||
|
py_tok.morph = c_tok.morph
|
||||||
|
py_tok.pos = c_tok.pos
|
||||||
|
py_tok.fine_pos = c_tok.fine_pos
|
||||||
|
py_tok.idx = c_tok.idx
|
||||||
|
py_tok.lemma = c_tok.lemma
|
||||||
|
py_tok.sense = c_tok.sense
|
||||||
|
py_tok.dep_tag = c_tok.dep_tag
|
||||||
|
py_tok.head_offset = c_tok.head
|
||||||
|
py_tok.l_kids = c_tok.l_kids
|
||||||
|
py_tok.r_kids = c_tok.r_kids
|
||||||
|
return py_tok
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token.
|
"""An individual token.
|
||||||
|
|
||||||
Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens
|
|
||||||
object.
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, Tokens tokens, int i):
|
def __init__(self):
|
||||||
self._seq = tokens
|
pass
|
||||||
self.i = i
|
#self._seq = tokens
|
||||||
|
#self.i = i
|
||||||
|
|
||||||
def __unicode__(self):
|
#def __unicode__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
cdef int end_idx = t.idx + t.lex.length
|
# cdef int end_idx = t.idx + t.lex.length
|
||||||
if self.i + 1 == self._seq.length:
|
# if self.i + 1 == self._seq.length:
|
||||||
return self.string
|
# return self.string
|
||||||
if end_idx == t[1].idx:
|
# if end_idx == t[1].idx:
|
||||||
return self.string
|
# return self.string
|
||||||
else:
|
# else:
|
||||||
return self.string + ' '
|
# return self.string + ' '
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The number of unicode code-points in the original string.
|
"""The number of unicode code-points in the original string.
|
||||||
|
@ -201,87 +232,87 @@ cdef class Token:
|
||||||
Returns:
|
Returns:
|
||||||
length (int):
|
length (int):
|
||||||
"""
|
"""
|
||||||
return self._seq.data[self.i].lex.length
|
return self.length
|
||||||
|
|
||||||
property idx:
|
#property idx:
|
||||||
"""The index into the original string at which the token starts.
|
# """The index into the original string at which the token starts.
|
||||||
|
|
||||||
The following is supposed to always be true:
|
# The following is supposed to always be true:
|
||||||
|
#
|
||||||
>>> original_string[token.idx:token.idx len(token) == token.string
|
# >>> original_string[token.idx:token.idx len(token) == token.string
|
||||||
"""
|
# """
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].idx
|
# return self._seq.data[self.i].idx
|
||||||
|
|
||||||
property cluster:
|
#property cluster:
|
||||||
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
# """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
||||||
|
#
|
||||||
Similar words have better-than-chance likelihood of having similar cluster
|
# Similar words have better-than-chance likelihood of having similar cluster
|
||||||
IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
# IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
||||||
and help to make models slightly more robust to domain variation.
|
# and help to make models slightly more robust to domain variation.
|
||||||
|
|
||||||
A common trick is to use only the first N bits of a cluster ID in a feature,
|
# A common trick is to use only the first N bits of a cluster ID in a feature,
|
||||||
as the more general part of the hierarchical clustering is often more accurate
|
# as the more general part of the hierarchical clustering is often more accurate
|
||||||
than the lower categories.
|
# than the lower categories.
|
||||||
|
|
||||||
To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
# To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
||||||
bit-mask:
|
# bit-mask:
|
||||||
|
|
||||||
>>> six_bits = cluster & (2**6 - 1)
|
# >>> six_bits = cluster & (2**6 - 1)
|
||||||
"""
|
# """
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].lex.cluster
|
# return self._seq.data[self.i].lex.cluster
|
||||||
|
|
||||||
property string:
|
#property string:
|
||||||
"""The unicode string of the word, with no whitespace padding."""
|
# """The unicode string of the word, with no whitespace padding."""
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
if t.lex.sic == 0:
|
# if t.lex.sic == 0:
|
||||||
return ''
|
# return ''
|
||||||
cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
|
# cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic]
|
||||||
return utf8string.decode('utf8')
|
# return utf8string.decode('utf8')
|
||||||
|
|
||||||
property lemma:
|
#property lemma:
|
||||||
"""The unicode string of the word's lemma. If no part-of-speech tag is
|
# """The unicode string of the word's lemma. If no part-of-speech tag is
|
||||||
assigned, the most common part-of-speech tag of the word is used.
|
# assigned, the most common part-of-speech tag of the word is used.
|
||||||
"""
|
# """
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
if t.lemma == 0:
|
# if t.lemma == 0:
|
||||||
return self.string
|
# return self.string
|
||||||
cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
|
# cdef bytes utf8string = self._seq.vocab.strings[t.lemma]
|
||||||
return utf8string.decode('utf8')
|
# return utf8string.decode('utf8')
|
||||||
|
|
||||||
property dep_tag:
|
#property dep_tag:
|
||||||
"""The ID integer of the word's dependency label. If no parse has been
|
# """The ID integer of the word's dependency label. If no parse has been
|
||||||
assigned, defaults to 0.
|
# assigned, defaults to 0.
|
||||||
"""
|
# """
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].dep_tag
|
# return self._seq.data[self.i].dep_tag
|
||||||
|
|
||||||
property pos:
|
#property pos:
|
||||||
"""The ID integer of the word's part-of-speech tag, from the 13-tag
|
# """The ID integer of the word's part-of-speech tag, from the 13-tag
|
||||||
Google Universal Tag Set. Constants for this tag set are available in
|
# Google Universal Tag Set. Constants for this tag set are available in
|
||||||
spacy.typedefs.
|
# spacy.typedefs.
|
||||||
"""
|
# """
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].pos
|
# return self._seq.data[self.i].pos
|
||||||
|
|
||||||
property fine_pos:
|
#property fine_pos:
|
||||||
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
# """The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
||||||
by the tagger model. Fine-grained tags include morphological information,
|
# by the tagger model. Fine-grained tags include morphological information,
|
||||||
and other distinctions, and allow a more accurate tagger to be trained.
|
# and other distinctions, and allow a more accurate tagger to be trained.
|
||||||
"""
|
# """
|
||||||
|
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].fine_pos
|
# return self._seq.data[self.i].fine_pos
|
||||||
|
|
||||||
property sic:
|
#property sic:
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
return self._seq.data[self.i].lex.sic
|
# return self._seq.data[self.i].lex.sic
|
||||||
|
|
||||||
property head:
|
#property head:
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
# """The token predicted by the parser to be the head of the current token."""
|
||||||
def __get__(self):
|
# def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
# cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
return Token(self._seq, self.i + t.head)
|
# return Token(self._seq, self.i + t.head)
|
||||||
|
|
|
@ -4,16 +4,16 @@ from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from murmurhash.mrmr cimport hash64
|
from murmurhash.mrmr cimport hash64
|
||||||
|
|
||||||
from .structs cimport Lexeme, TokenC, UniStr
|
from .structs cimport LexemeC, TokenC, UniStr
|
||||||
from .typedefs cimport utf8_t, id_t, hash_t
|
from .typedefs cimport utf8_t, id_t, hash_t
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
|
||||||
|
|
||||||
cdef Lexeme EMPTY_LEXEME
|
cdef LexemeC EMPTY_LEXEME
|
||||||
|
|
||||||
|
|
||||||
cdef union LexemesOrTokens:
|
cdef union LexemesOrTokens:
|
||||||
const Lexeme* const* lexemes
|
const LexemeC* const* lexemes
|
||||||
TokenC* tokens
|
TokenC* tokens
|
||||||
|
|
||||||
|
|
||||||
|
@ -27,9 +27,9 @@ cdef class Vocab:
|
||||||
cpdef public get_lex_props
|
cpdef public get_lex_props
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cpdef readonly StringStore strings
|
cpdef readonly StringStore strings
|
||||||
cdef vector[Lexeme*] lexemes
|
cdef vector[LexemeC*] lexemes
|
||||||
|
|
||||||
cdef const Lexeme* get(self, Pool mem, UniStr* s) except NULL
|
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||||
|
|
||||||
cdef PreshMap _map
|
cdef PreshMap _map
|
||||||
|
|
||||||
|
|
|
@ -2,20 +2,27 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
|
import codecs
|
||||||
|
|
||||||
from .lexeme cimport EMPTY_LEXEME
|
from .lexeme cimport EMPTY_LEXEME
|
||||||
from .lexeme cimport init as lexeme_init
|
from .lexeme cimport init as lexeme_init
|
||||||
from .strings cimport slice_unicode
|
from .strings cimport slice_unicode
|
||||||
from .typedefs cimport flags_t
|
from .strings cimport hash_string
|
||||||
from .orth cimport word_shape
|
from .orth cimport word_shape
|
||||||
|
|
||||||
|
|
||||||
memset(&EMPTY_LEXEME, 0, sizeof(Lexeme))
|
DEF MAX_VEC_SIZE = 100000
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
|
cdef float[MAX_VEC_SIZE] EMPTY_VEC
|
||||||
|
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
|
||||||
|
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
|
EMPTY_LEXEME.vec = EMPTY_VEC
|
||||||
|
|
||||||
|
|
||||||
|
cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed,
|
||||||
StringStore string_store, dict props) except *:
|
StringStore string_store, dict props) except *:
|
||||||
cdef Lexeme lex
|
cdef LexemeC lex
|
||||||
lex.id = i
|
lex.id = i
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
lex.sic = string_store[string]
|
lex.sic = string_store[string]
|
||||||
|
@ -28,13 +35,12 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
|
||||||
lex.suffix = string_store[string[-3:]]
|
lex.suffix = string_store[string[-3:]]
|
||||||
lex.shape = string_store[word_shape(string)]
|
lex.shape = string_store[word_shape(string)]
|
||||||
|
|
||||||
cdef object flags_val = props.get('flags', 0)
|
lex.flags = props.get('flags', 0)
|
||||||
lex.flags = <flags_t>flags_val
|
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
'''A map container for a language's Lexeme structs.
|
'''A map container for a language's LexemeC structs.
|
||||||
'''
|
'''
|
||||||
def __init__(self, data_dir=None, get_lex_props=None):
|
def __init__(self, data_dir=None, get_lex_props=None):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -50,24 +56,25 @@ cdef class Vocab:
|
||||||
if not path.isdir(data_dir):
|
if not path.isdir(data_dir):
|
||||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||||
self.strings.load(path.join(data_dir, 'strings.txt'))
|
self.strings.load(path.join(data_dir, 'strings.txt'))
|
||||||
self.load(path.join(data_dir, 'lexemes.bin'))
|
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
|
||||||
|
#self.load_vectors(path.join(data_dir, 'deps.words'))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored."""
|
"""The current number of lexemes stored."""
|
||||||
return self.lexemes.size()
|
return self.lexemes.size()
|
||||||
|
|
||||||
cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL:
|
||||||
'''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
|
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||||
cdef Lexeme* lex
|
cdef LexemeC* lex
|
||||||
lex = <Lexeme*>self._map.get(string.key)
|
lex = <LexemeC*>self._map.get(string.key)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
return lex
|
return lex
|
||||||
if string.n < 3:
|
if string.n < 3:
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
cdef unicode py_string = string.chars[:string.n]
|
cdef unicode py_string = string.chars[:string.n]
|
||||||
lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
|
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
|
||||||
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
|
lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings,
|
||||||
self.get_lex_props(py_string))
|
self.get_lex_props(py_string))
|
||||||
if mem is self.mem:
|
if mem is self.mem:
|
||||||
|
@ -81,13 +88,13 @@ cdef class Vocab:
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
unseen unicode string is given, a new Lexeme is created and stored.
|
unseen unicode string is given, a new LexemeC is created and stored.
|
||||||
|
|
||||||
This function relies on Cython's struct-to-dict conversion. Python clients
|
This function relies on Cython's struct-to-dict conversion. Python clients
|
||||||
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
receive a dict keyed by strings (byte or unicode, depending on Python 2/3),
|
||||||
with int values. Cython clients can instead receive a Lexeme struct value.
|
with int values. Cython clients can instead receive a LexemeC struct value.
|
||||||
More efficient Cython access is provided by Lexicon.get, which returns
|
More efficient Cython access is provided by Lexicon.get, which returns
|
||||||
a Lexeme*.
|
a LexemeC*.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||||
|
@ -96,24 +103,26 @@ cdef class Vocab:
|
||||||
is raised.
|
is raised.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
lexeme (dict): A Lexeme struct instance, which Cython translates into
|
lexeme (dict): A LexemeC struct instance, which Cython translates into
|
||||||
a dict if the operator is called from Python.
|
a dict if the operator is called from Python.
|
||||||
'''
|
'''
|
||||||
if type(id_or_string) == int:
|
if type(id_or_string) == int:
|
||||||
if id_or_string >= self.lexemes.size():
|
if id_or_string >= self.lexemes.size():
|
||||||
raise IndexError
|
raise IndexError
|
||||||
return self.lexemes.at(id_or_string)[0]
|
return {}
|
||||||
|
#return self.lexemes.at(id_or_string)[0]
|
||||||
cdef UniStr string
|
cdef UniStr string
|
||||||
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
slice_unicode(&string, id_or_string, 0, len(id_or_string))
|
||||||
cdef const Lexeme* lexeme = self.get(self.mem, &string)
|
cdef const LexemeC* lexeme = self.get(self.mem, &string)
|
||||||
return lexeme[0]
|
return {}
|
||||||
|
#return lexeme[0]
|
||||||
|
|
||||||
def __setitem__(self, unicode uni_string, dict props):
|
def __setitem__(self, unicode uni_string, dict props):
|
||||||
cdef UniStr s
|
cdef UniStr s
|
||||||
slice_unicode(&s, uni_string, 0, len(uni_string))
|
slice_unicode(&s, uni_string, 0, len(uni_string))
|
||||||
# Cast through the const here, since we're allowed to change our own
|
# Cast through the const here, since we're allowed to change our own
|
||||||
# Lexemes.
|
# LexemeCs.
|
||||||
lex = <Lexeme*><void*>self.get(self.mem, &s)
|
lex = <LexemeC*><void*>self.get(self.mem, &s)
|
||||||
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props)
|
||||||
|
|
||||||
def dump(self, loc):
|
def dump(self, loc):
|
||||||
|
@ -128,30 +137,30 @@ cdef class Vocab:
|
||||||
key = self._map.c_map.cells[i].key
|
key = self._map.c_map.cells[i].key
|
||||||
if key == 0:
|
if key == 0:
|
||||||
continue
|
continue
|
||||||
lexeme = <Lexeme*>self._map.c_map.cells[i].value
|
lexeme = <LexemeC*>self._map.c_map.cells[i].value
|
||||||
st = fwrite(&key, sizeof(key), 1, fp)
|
st = fwrite(&key, sizeof(key), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fwrite(lexeme, sizeof(Lexeme), 1, fp)
|
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
|
||||||
assert st == 1
|
assert st == 1
|
||||||
st = fclose(fp)
|
st = fclose(fp)
|
||||||
assert st == 0
|
assert st == 0
|
||||||
|
|
||||||
def load(self, loc):
|
def load_lexemes(self, loc):
|
||||||
if not path.exists(loc):
|
if not path.exists(loc):
|
||||||
raise IOError('Lexemes file not found at %s' % loc)
|
raise IOError('LexemeCs file not found at %s' % loc)
|
||||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||||
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
cdef FILE* fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
assert fp != NULL
|
assert fp != NULL
|
||||||
cdef size_t st
|
cdef size_t st
|
||||||
cdef Lexeme* lexeme
|
cdef LexemeC* lexeme
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
i = 0
|
i = 0
|
||||||
while True:
|
while True:
|
||||||
st = fread(&key, sizeof(key), 1, fp)
|
st = fread(&key, sizeof(key), 1, fp)
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
|
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||||
st = fread(lexeme, sizeof(Lexeme), 1, fp)
|
st = fread(lexeme, sizeof(LexemeC), 1, fp)
|
||||||
if st != 1:
|
if st != 1:
|
||||||
break
|
break
|
||||||
self._map.set(key, lexeme)
|
self._map.set(key, lexeme)
|
||||||
|
@ -160,3 +169,29 @@ cdef class Vocab:
|
||||||
self.lexemes[lexeme.id] = lexeme
|
self.lexemes[lexeme.id] = lexeme
|
||||||
i += 1
|
i += 1
|
||||||
fclose(fp)
|
fclose(fp)
|
||||||
|
|
||||||
|
def load_vectors(self, loc):
|
||||||
|
cdef int i
|
||||||
|
cdef unicode line
|
||||||
|
cdef unicode word
|
||||||
|
cdef unicode val_str
|
||||||
|
cdef hash_t key
|
||||||
|
cdef LexemeC* lex
|
||||||
|
cdef float* vec
|
||||||
|
|
||||||
|
with codecs.open(loc, 'r', 'utf8') as file_:
|
||||||
|
for line in file_:
|
||||||
|
pieces = line.split()
|
||||||
|
word = pieces.pop(0)
|
||||||
|
if len(pieces) >= MAX_VEC_SIZE:
|
||||||
|
sizes = (len(pieces), MAX_VEC_SIZE)
|
||||||
|
msg = ("Your vector is %d elements."
|
||||||
|
"The compile-time limit is %d elements." % sizes)
|
||||||
|
raise ValueError(msg)
|
||||||
|
key = hash_string(word)
|
||||||
|
lex = <LexemeC*>self._map.get(key)
|
||||||
|
if lex is not NULL:
|
||||||
|
vec = <float*>self.mem.alloc(len(pieces), sizeof(float))
|
||||||
|
for i, val_str in enumerate(pieces):
|
||||||
|
vec[i] = float(val_str)
|
||||||
|
lex.vec = vec
|
||||||
|
|
Loading…
Reference in New Issue
Block a user