Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 09:14:32 +03:00)
Only store LexemeC structs in the vocabulary, transforming them to Lexeme objects for output. Moving away from Lexeme objects for Tokens soon.

This commit is contained in:
parent  b5b31c6b6e
commit  5b1c651661
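At a high level, the change makes the vocabulary own plain LexemeC structs and build a Lexeme wrapper only at the output boundary, instead of storing Python Lexeme objects directly. Below is a minimal pure-Python sketch of that ownership pattern; the record, view and Vocab classes are illustrative stand-ins, not spaCy's actual types.

# Illustrative sketch only: plain-Python stand-ins for the C-level LexemeC
# struct and the Lexeme wrapper used in the real (Cython) code.

class LexemeRecord(object):
    """Stand-in for LexemeC: plain data, owned by the vocabulary."""
    def __init__(self, string, prob, cluster, views, flags):
        self.string = string
        self.prob = prob
        self.cluster = cluster
        self.views = views
        self.flags = flags


class LexemeView(object):
    """Stand-in for Lexeme: a thin wrapper built on demand for output."""
    def __init__(self, record):
        self._c = record  # does not own the record; the vocabulary does

    @property
    def string(self):
        return self._c.string


class Vocab(object):
    def __init__(self):
        self._dict = {}  # string -> raw record, never wrapper objects
        self.size = 0

    def add(self, string, prob=0.0, cluster=0):
        self._dict[string] = LexemeRecord(string, prob, cluster, [], set())
        self.size += 1

    def lookup(self, string):
        # Wrap only when handing data back, mirroring `return Lexeme(...)`.
        return LexemeView(self._dict[string])

Repeated lookups hand out fresh wrappers around the same underlying record, which is the behaviour the commit moves towards.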
fabfile.py (vendored): 15 lines changed
@@ -1,14 +1,25 @@
+import json
+
 from fabric.api import local, run, lcd, cd, env
 
 
 def make():
     local('python setup.py build_ext --inplace')
 
 
 def clean():
     local('python setup.py clean --all')
 
 
 def docs():
-    with lcd('docs'):
-        local('sphinx-build -b html . ./_build')
+    local('sphinx-build -b html docs/ .')
 
 
 def test():
     local('py.test -x')
+
+def sbox():
+    local('python sb_setup.py build_ext --inplace')
+
+def sbclean():
+    local('python sb_setup.py clean --all')
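The new sbox/sbclean entries follow the same Fabric 1.x pattern as the existing tasks: plain module-level functions that shell out through local() and are run from the command line as `fab <task>` (for example `fab sbox`). A hypothetical composite task, not part of this commit, could chain them to rebuild the sandbox extension from scratch:

from fabric.api import local


def sbrebuild():
    # Hypothetical convenience task: clean the sandbox build products,
    # then rebuild the extension modules in place.
    local('python sb_setup.py clean --all')
    local('python sb_setup.py build_ext --inplace')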
@@ -15,6 +15,7 @@ from os import path
 
 from .util import read_lang_data
 from spacy.tokens import Tokens
+from spacy.lexeme cimport LexemeC, lexeme_init
 
 
 cdef class Language:
@@ -76,9 +77,10 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        assert string
 
         cdef size_t length = len(string)
+        if length == 0:
+            return []
 
         cdef size_t start = 0
         cdef size_t i = 0
         cdef Tokens tokens = self.tokens_class()
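The entry point now treats the empty string as ordinary input and returns an empty result instead of tripping an assert. A toy illustration of that contract (not spaCy's tokenizer, just the guard):

def tokenize(string):
    # Empty input is valid and yields an empty token sequence.
    if len(string) == 0:
        return []
    return string.split()


assert tokenize('') == []
assert tokenize('a b') == ['a', 'b']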
@@ -162,10 +164,18 @@ cdef class Lexicon:
         self.size = 0
         cdef Lexeme word
         for string in words:
-            word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
-                          case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self._string_features, self._flag_features)
-            self._dict[string] = word
+            prob = probs.get(string, 0.0)
+            cluster = clusters.get(string, 0.0)
+            cases = case_stats.get(string, {})
+            tags = tag_stats.get(string, {})
+            views = [string_view(string, prob, cluster, cases, tags)
+                     for string_view in self._string_features]
+            flags = set()
+            for i, flag_feature in enumerate(self._flag_features):
+                if flag_feature(string, prob, cluster, cases, tags):
+                    flags.add(i)
+            lexeme = lexeme_init(string, prob, cluster, views, flags)
+            self._dict[string] = <size_t>lexeme
             self.size += 1
 
     cpdef Lexeme lookup(self, unicode string):
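Feature computation moves out of Lexeme.__cinit__ and into the Lexicon: the string features produce a list of "views" and the boolean flag features produce a set of indices, which lexeme_init then stores on the C struct. A sketch of the two steps in plain Python follows; compute_flags mirrors the loop above, while pack_flags is a hypothetical illustration of how a set of feature indices maps to a bitfield, not a function from the commit.

def compute_flags(string, prob, cluster, cases, tags, flag_features):
    # Record the index of every boolean feature that fires for this word.
    flags = set()
    for i, flag_feature in enumerate(flag_features):
        if flag_feature(string, prob, cluster, cases, tags):
            flags.add(i)
    return flags


def pack_flags(flags):
    # Hypothetical packing step: one bit per feature index,
    # e.g. {0, 3} -> 0b1001.
    bits = 0
    for i in flags:
        bits |= 1 << i
    return bits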
@@ -177,14 +187,19 @@ cdef class Lexicon:
         Returns:
             lexeme (Lexeme): A reference to a lexical type.
         """
-        cdef Lexeme lexeme
+        cdef LexemeC* lexeme
         assert len(string) != 0
         if string in self._dict:
-            lexeme = self._dict[string]
-            return lexeme
+            return Lexeme(self._dict[string])
 
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
-                                  self._flag_features)
-        self._dict[string] = word
+        views = [string_view(string, 0.0, 0, {}, {})
+                 for string_view in self._string_features]
+        flags = set()
+        for i, flag_feature in enumerate(self._flag_features):
+            if flag_feature(string, 0.0, {}, {}):
+                flags.add(i)
+
+        lexeme = lexeme_init(string, 0, 0, views, flags)
+        self._dict[string] = <size_t>lexeme
         self.size += 1
-        return word
+        return Lexeme(<size_t>lexeme)
@@ -49,23 +49,8 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
-                  dict tag_stats, list string_features, list flag_features):
-        views = []
-        cdef unicode view
-        for string_feature in string_features:
-            view = string_feature(string, prob, cluster, case_stats, tag_stats)
-            views.append(view)
-
-        flags = set()
-        for i, flag_feature in enumerate(flag_features):
-            if flag_feature(string, prob, case_stats, tag_stats):
-                if (1 << i):
-                    flags.add(i)
-        self._c = lexeme_init(string, prob, cluster, views, flags)
-
-    def __dealloc__(self):
-        lexeme_free(self._c)
-
+    def __cinit__(self, size_t lexeme_addr):
+        self._c = <LexemeC*>lexeme_addr
 
     property string:
         def __get__(self):
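Lexeme now receives the struct's address as an opaque size_t handle and casts it back to a LexemeC pointer; it neither copies nor frees the data, which is why the __dealloc__/lexeme_free pair disappears. In pure Python the closest analogue of passing an address around is an integer handle resolved through a registry. The sketch below is purely illustrative of that handle-plus-view idea; the real code casts a C pointer and the Lexicon owns the memory.

_registry = {}   # handle (int) -> record; stands in for raw memory


def register(record):
    handle = id(record)          # any stable integer works as a "handle"
    _registry[handle] = record   # the registry, like the Lexicon, owns it
    return handle


class LexemeHandleView(object):
    """Wraps an integer handle; analogous to Lexeme(<size_t>lexeme)."""
    def __init__(self, handle):
        self._c = _registry[handle]   # resolve, do not copy

    @property
    def string(self):
        return self._c['string']

    # No __del__ / __dealloc__: the view never frees the underlying record.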