* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon.

This commit is contained in:
Matthew Honnibal 2014-09-11 12:28:38 +02:00
parent b5b31c6b6e
commit 5b1c651661
3 changed files with 43 additions and 32 deletions

15
fabfile.py vendored
View File

@@ -1,14 +1,25 @@
import json
from fabric.api import local, run, lcd, cd, env from fabric.api import local, run, lcd, cd, env
def make(): def make():
local('python setup.py build_ext --inplace') local('python setup.py build_ext --inplace')
def clean(): def clean():
local('python setup.py clean --all') local('python setup.py clean --all')
def docs(): def docs():
with lcd('docs'): local('sphinx-build -b html docs/ .')
local('sphinx-build -b html . ./_build')
def test(): def test():
local('py.test -x') local('py.test -x')
def sbox():
local('python sb_setup.py build_ext --inplace')
def sbclean():
local('python sb_setup.py clean --all')

View File

@@ -15,6 +15,7 @@ from os import path
from .util import read_lang_data from .util import read_lang_data
from spacy.tokens import Tokens from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init
cdef class Language: cdef class Language:
@@ -76,9 +77,10 @@ cdef class Language:
Returns: Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
""" """
assert string
cdef size_t length = len(string) cdef size_t length = len(string)
if length == 0:
return []
cdef size_t start = 0 cdef size_t start = 0
cdef size_t i = 0 cdef size_t i = 0
cdef Tokens tokens = self.tokens_class() cdef Tokens tokens = self.tokens_class()
@@ -162,10 +164,18 @@ cdef class Lexicon:
self.size = 0 self.size = 0
cdef Lexeme word cdef Lexeme word
for string in words: for string in words:
word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0), prob = probs.get(string, 0.0)
case_stats.get(string, {}), tag_stats.get(string, {}), cluster = clusters.get(string, 0.0)
self._string_features, self._flag_features) cases = case_stats.get(string, {})
self._dict[string] = word tags = tag_stats.get(string, {})
views = [string_view(string, prob, cluster, cases, tags)
for string_view in self._string_features]
flags = set()
for i, flag_feature in enumerate(self._flag_features):
if flag_feature(string, prob, cluster, cases, tags):
flags.add(i)
lexeme = lexeme_init(string, prob, cluster, views, flags)
self._dict[string] = <size_t>lexeme
self.size += 1 self.size += 1
cpdef Lexeme lookup(self, unicode string): cpdef Lexeme lookup(self, unicode string):
@@ -177,14 +187,19 @@ cdef class Lexicon:
Returns: Returns:
lexeme (Lexeme): A reference to a lexical type. lexeme (Lexeme): A reference to a lexical type.
""" """
cdef Lexeme lexeme cdef LexemeC* lexeme
assert len(string) != 0 assert len(string) != 0
if string in self._dict: if string in self._dict:
lexeme = self._dict[string] return Lexeme(self._dict[string])
return lexeme
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features, views = [string_view(string, 0.0, 0, {}, {})
self._flag_features) for string_view in self._string_features]
self._dict[string] = word flags = set()
for i, flag_feature in enumerate(self._flag_features):
if flag_feature(string, 0.0, {}, {}):
flags.add(i)
lexeme = lexeme_init(string, 0, 0, views, flags)
self._dict[string] = <size_t>lexeme
self.size += 1 self.size += 1
return word return Lexeme(<size_t>lexeme)

View File

@@ -49,23 +49,8 @@ cdef class Lexeme:
while "dapple" is totally different. On the other hand, "scalable" receives while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like. the same cluster ID as "pineapple", which is not what we'd like.
""" """
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats, def __cinit__(self, size_t lexeme_addr):
dict tag_stats, list string_features, list flag_features): self._c = <LexemeC*>lexeme_addr
views = []
cdef unicode view
for string_feature in string_features:
view = string_feature(string, prob, cluster, case_stats, tag_stats)
views.append(view)
flags = set()
for i, flag_feature in enumerate(flag_features):
if flag_feature(string, prob, case_stats, tag_stats):
if (1 << i):
flags.add(i)
self._c = lexeme_init(string, prob, cluster, views, flags)
def __dealloc__(self):
lexeme_free(self._c)
property string: property string:
def __get__(self): def __get__(self):