* Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon.

This commit is contained in:
Matthew Honnibal 2014-09-11 12:28:38 +02:00
parent b5b31c6b6e
commit 5b1c651661
3 changed files with 43 additions and 32 deletions

15
fabfile.py vendored
View File

@ -1,14 +1,25 @@
import json
from fabric.api import local, run, lcd, cd, env
def make():
local('python setup.py build_ext --inplace')
def clean():
local('python setup.py clean --all')
def docs():
with lcd('docs'):
local('sphinx-build -b html . ./_build')
local('sphinx-build -b html docs/ .')
def test():
local('py.test -x')
def sbox():
local('python sb_setup.py build_ext --inplace')
def sbclean():
local('python sb_setup.py clean --all')

View File

@ -15,6 +15,7 @@ from os import path
from .util import read_lang_data
from spacy.tokens import Tokens
from spacy.lexeme cimport LexemeC, lexeme_init
cdef class Language:
@ -76,9 +77,10 @@ cdef class Language:
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
"""
assert string
cdef size_t length = len(string)
if length == 0:
return []
cdef size_t start = 0
cdef size_t i = 0
cdef Tokens tokens = self.tokens_class()
@ -162,10 +164,18 @@ cdef class Lexicon:
self.size = 0
cdef Lexeme word
for string in words:
word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
case_stats.get(string, {}), tag_stats.get(string, {}),
self._string_features, self._flag_features)
self._dict[string] = word
prob = probs.get(string, 0.0)
cluster = clusters.get(string, 0.0)
cases = case_stats.get(string, {})
tags = tag_stats.get(string, {})
views = [string_view(string, prob, cluster, cases, tags)
for string_view in self._string_features]
flags = set()
for i, flag_feature in enumerate(self._flag_features):
if flag_feature(string, prob, cluster, cases, tags):
flags.add(i)
lexeme = lexeme_init(string, prob, cluster, views, flags)
self._dict[string] = <size_t>lexeme
self.size += 1
cpdef Lexeme lookup(self, unicode string):
@ -177,14 +187,19 @@ cdef class Lexicon:
Returns:
lexeme (Lexeme): A reference to a lexical type.
"""
cdef Lexeme lexeme
cdef LexemeC* lexeme
assert len(string) != 0
if string in self._dict:
lexeme = self._dict[string]
return lexeme
return Lexeme(self._dict[string])
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
self._flag_features)
self._dict[string] = word
views = [string_view(string, 0.0, 0, {}, {})
for string_view in self._string_features]
flags = set()
for i, flag_feature in enumerate(self._flag_features):
if flag_feature(string, 0.0, {}, {}):
flags.add(i)
lexeme = lexeme_init(string, 0, 0, views, flags)
self._dict[string] = <size_t>lexeme
self.size += 1
return word
return Lexeme(<size_t>lexeme)

View File

@ -49,23 +49,8 @@ cdef class Lexeme:
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
dict tag_stats, list string_features, list flag_features):
views = []
cdef unicode view
for string_feature in string_features:
view = string_feature(string, prob, cluster, case_stats, tag_stats)
views.append(view)
flags = set()
for i, flag_feature in enumerate(flag_features):
if flag_feature(string, prob, case_stats, tag_stats):
if (1 << i):
flags.add(i)
self._c = lexeme_init(string, prob, cluster, views, flags)
def __dealloc__(self):
lexeme_free(self._c)
def __cinit__(self, size_t lexeme_addr):
self._c = <LexemeC*>lexeme_addr
property string:
def __get__(self):