mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Moving to Word objects in place of the Lexeme struct.
This commit is contained in:
parent
782806df08
commit
4f01df9152
|
@ -11,8 +11,6 @@ Special-case tokenization rules are read from data/<lang>/tokenization .
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libcpp.pair cimport pair
|
|
||||||
from cython.operator cimport dereference as deref
|
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
from os import path
|
from os import path
|
||||||
|
@ -61,19 +59,17 @@ cdef class Language:
|
||||||
cdef Word lookup(self, unicode string):
|
cdef Word lookup(self, unicode string):
|
||||||
assert len(string) != 0
|
assert len(string) != 0
|
||||||
cdef Word word
|
cdef Word word
|
||||||
cdef StringHash h = hash(string)
|
if string in self.vocab:
|
||||||
if h in self.vocab:
|
word = self.vocab[string]
|
||||||
word = self.vocab[h]
|
|
||||||
else:
|
else:
|
||||||
word = self.new_lexeme(string)
|
word = self.new_lexeme(string)
|
||||||
return word
|
return word
|
||||||
|
|
||||||
cdef list lookup_chunk(self, unicode string):
|
cdef list lookup_chunk(self, unicode string):
|
||||||
cdef StringHash h = hash(string)
|
|
||||||
cdef list chunk
|
cdef list chunk
|
||||||
cdef size_t chunk_id
|
cdef size_t chunk_id
|
||||||
if h in self.chunks:
|
if string in self.chunks:
|
||||||
chunk = self.chunks[h]
|
chunk = self.chunks[string]
|
||||||
else:
|
else:
|
||||||
chunk = self.new_chunk(string, self.find_substrings(string))
|
chunk = self.new_chunk(string, self.find_substrings(string))
|
||||||
return chunk
|
return chunk
|
||||||
|
@ -82,15 +78,14 @@ cdef class Language:
|
||||||
chunk = []
|
chunk = []
|
||||||
for i, substring in enumerate(substrings):
|
for i, substring in enumerate(substrings):
|
||||||
chunk.append(self.lookup(substring))
|
chunk.append(self.lookup(substring))
|
||||||
cdef StringHash h = hash(string)
|
self.chunks[string] = chunk
|
||||||
self.chunks[h] = chunk
|
|
||||||
return chunk
|
return chunk
|
||||||
|
|
||||||
cdef Word new_lexeme(self, unicode string):
|
cdef Word new_lexeme(self, unicode string):
|
||||||
string_views = [view_func(string) for view_func in self.view_funcs]
|
string_views = [view_func(string) for view_func in self.view_funcs]
|
||||||
word = Word(string.encode('utf8'), string_views)
|
word = Word(string.encode('utf8'), string_views)
|
||||||
self.bacov[word.lex] = string
|
self.bacov[word.lex] = string
|
||||||
self.vocab[word.lex] = word
|
self.vocab[string] = word
|
||||||
return word
|
return word
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user