mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Changes to Doc and Token for new string store scheme
This commit is contained in:
parent
78f19baafa
commit
99de44d864
|
@ -133,6 +133,12 @@ cdef class Doc:
|
||||||
self.push_back(
|
self.push_back(
|
||||||
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
|
<const LexemeC*>self.vocab.get(self.mem, orth), has_space)
|
||||||
|
|
||||||
|
def __dealloc__(self):
|
||||||
|
if self.mem is not None \
|
||||||
|
and self.vocab is not None \
|
||||||
|
and self.vocab.strings is not None:
|
||||||
|
self.vocab.strings.remove_oov_map(self.mem)
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
'''
|
'''
|
||||||
doc[i]
|
doc[i]
|
||||||
|
@ -600,14 +606,14 @@ cdef class Doc:
|
||||||
if tag in self.vocab.morphology.tag_map:
|
if tag in self.vocab.morphology.tag_map:
|
||||||
self.vocab.morphology.assign_tag(token, tag)
|
self.vocab.morphology.assign_tag(token, tag)
|
||||||
else:
|
else:
|
||||||
token.tag = self.vocab.strings[tag]
|
token.tag = self.vocab.strings.intern(tag)
|
||||||
token.lemma = self.vocab.strings[lemma]
|
token.lemma = self.vocab.strings.intern(lemma, mem=self.mem)
|
||||||
if ent_type == 'O':
|
if ent_type == 'O':
|
||||||
token.ent_iob = 2
|
token.ent_iob = 2
|
||||||
token.ent_type = 0
|
token.ent_type = 0
|
||||||
else:
|
else:
|
||||||
token.ent_iob = 3
|
token.ent_iob = 3
|
||||||
token.ent_type = self.vocab.strings[ent_type]
|
token.ent_type = self.vocab.strings.intern(ent_type)
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a dependency
|
# Before thinking of something simpler, beware the case where a dependency
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||||
# Compiler crashes on memory view coercion without this. Should report bug.
|
# Compiler crashes on memory view coercion without this. Should report bug.
|
||||||
|
@ -438,19 +440,19 @@ cdef class Token:
|
||||||
|
|
||||||
property orth_:
|
property orth_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.doc.mem)
|
||||||
|
|
||||||
property lower_:
|
property lower_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.doc.mem)
|
||||||
|
|
||||||
property norm_:
|
property norm_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.doc.mem)
|
||||||
|
|
||||||
property shape_:
|
property shape_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.doc.mem)
|
||||||
|
|
||||||
property prefix_:
|
property prefix_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -462,11 +464,11 @@ cdef class Token:
|
||||||
|
|
||||||
property lang_:
|
property lang_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.doc.mem)
|
||||||
|
|
||||||
property lemma_:
|
property lemma_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.lemma, mem=self.doc.mem)
|
||||||
|
|
||||||
property pos_:
|
property pos_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -474,7 +476,7 @@ cdef class Token:
|
||||||
|
|
||||||
property tag_:
|
property tag_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.vocab.strings.decode_int(self.c.tag, mem=self.mem)
|
return self.vocab.strings.decode_int(self.c.tag, mem=self.doc.mem)
|
||||||
|
|
||||||
property dep_:
|
property dep_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user