Mirror of https://github.com/explosion/spaCy.git (last synced 2024-11-13 05:07:03 +03:00)
* Moving to storing LexemeC structs internally
This commit is contained in:
Parent: bf9c60c31c
Commit: c8f7c8bfde
|
@ -17,6 +17,7 @@ cdef class Lexicon:
|
||||||
cpdef readonly size_t size
|
cpdef readonly size_t size
|
||||||
|
|
||||||
cpdef Lexeme lookup(self, unicode string)
|
cpdef Lexeme lookup(self, unicode string)
|
||||||
|
cdef size_t get(self, unicode string)
|
||||||
|
|
||||||
cdef dict _dict
|
cdef dict _dict
|
||||||
|
|
||||||
|
|
|
@ -102,7 +102,7 @@ cdef class Language:
|
||||||
substrings = self._split(string)
|
substrings = self._split(string)
|
||||||
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes[i] = self.lexicon.lookup(substring)._c
|
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
self.cache[string] = <size_t>lexemes
|
self.cache[string] = <size_t>lexemes
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
|
@ -152,7 +152,7 @@ cdef class Language:
|
||||||
for string, substrings in token_rules:
|
for string, substrings in token_rules:
|
||||||
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
lexemes = <LexemeC**>calloc(len(substrings) + 1, sizeof(LexemeC*))
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
lexemes[i] = self.lexicon.lookup(substring)._c
|
lexemes[i] = <LexemeC*>self.lexicon.get(substring)
|
||||||
lexemes[i + 1] = NULL
|
lexemes[i + 1] = NULL
|
||||||
self.cache[string] = <size_t>lexemes
|
self.cache[string] = <size_t>lexemes
|
||||||
|
|
||||||
|
@ -180,19 +180,11 @@ cdef class Lexicon:
|
||||||
self._dict[string] = <size_t>lexeme
|
self._dict[string] = <size_t>lexeme
|
||||||
self.size += 1
|
self.size += 1
|
||||||
|
|
||||||
cpdef Lexeme lookup(self, unicode string):
|
cdef size_t get(self, unicode string):
|
||||||
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
|
|
||||||
|
|
||||||
Args
|
|
||||||
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
lexeme (Lexeme): A reference to a lexical type.
|
|
||||||
"""
|
|
||||||
cdef LexemeC* lexeme
|
cdef LexemeC* lexeme
|
||||||
assert len(string) != 0
|
assert len(string) != 0
|
||||||
if string in self._dict:
|
if string in self._dict:
|
||||||
return Lexeme(self._dict[string])
|
return self._dict[string]
|
||||||
|
|
||||||
views = [string_view(string, 0.0, 0, {}, {})
|
views = [string_view(string, 0.0, 0, {}, {})
|
||||||
for string_view in self._string_features]
|
for string_view in self._string_features]
|
||||||
|
@ -204,4 +196,16 @@ cdef class Lexicon:
|
||||||
lexeme = lexeme_init(string, 0, 0, views, flags)
|
lexeme = lexeme_init(string, 0, 0, views, flags)
|
||||||
self._dict[string] = <size_t>lexeme
|
self._dict[string] = <size_t>lexeme
|
||||||
self.size += 1
|
self.size += 1
|
||||||
return Lexeme(<size_t>lexeme)
|
return <size_t>lexeme
|
||||||
|
|
||||||
|
cpdef Lexeme lookup(self, unicode string):
|
||||||
|
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
|
||||||
|
|
||||||
|
Args
|
||||||
|
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
lexeme (Lexeme): A reference to a lexical type.
|
||||||
|
"""
|
||||||
|
cdef size_t lexeme = self.get(string)
|
||||||
|
return Lexeme(lexeme)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user