mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time.
This commit is contained in:
parent
0973e2f107
commit
317cbbc015
|
@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
||||||
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||||
scorer.tags_acc,
|
scorer.tags_acc,
|
||||||
scorer.token_acc)
|
scorer.token_acc)
|
||||||
nlp.parser.model.end_training()
|
nlp.end_training()
|
||||||
nlp.entity.model.end_training()
|
|
||||||
nlp.tagger.model.end_training()
|
|
||||||
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||||
beam_width=None):
|
beam_width=None):
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from os import path
|
from os import path
|
||||||
import re
|
import re
|
||||||
|
import struct
|
||||||
|
import json
|
||||||
|
|
||||||
from .. import orth
|
from .. import orth
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
|
||||||
from ..syntax.arc_eager import ArcEager
|
from ..syntax.arc_eager import ArcEager
|
||||||
from ..syntax.ner import BiluoPushDown
|
from ..syntax.ner import BiluoPushDown
|
||||||
from ..syntax.parser import ParserFactory
|
from ..syntax.parser import ParserFactory
|
||||||
|
from ..serialize.bits import BitArray
|
||||||
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..multi_words import RegexMerger
|
from ..multi_words import RegexMerger
|
||||||
|
@ -19,6 +22,8 @@ from . import regexes
|
||||||
|
|
||||||
from ..util import read_lang_data
|
from ..util import read_lang_data
|
||||||
|
|
||||||
|
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
|
|
||||||
def get_lex_props(string):
|
def get_lex_props(string):
|
||||||
return {
|
return {
|
||||||
|
@ -74,7 +79,7 @@ class English(object):
|
||||||
load_vectors=True
|
load_vectors=True
|
||||||
):
|
):
|
||||||
|
|
||||||
self._data_dir = data_dir
|
self.data_dir = data_dir
|
||||||
|
|
||||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
|
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
|
||||||
get_lex_props=get_lex_props, load_vectors=load_vectors,
|
get_lex_props=get_lex_props, load_vectors=load_vectors,
|
||||||
|
@ -140,6 +145,29 @@ class English(object):
|
||||||
self.mwe_merger(tokens)
|
self.mwe_merger(tokens)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
def end_training(self, data_dir=None):
    """Finalise the statistical models and write training artefacts to disk.

    Calls ``end_training()`` on the parser, entity recogniser and tagger
    models, dumps the string store, then writes the attribute frequency
    tables collected during training as JSON lists of ``(key, count)`` items.

    data_dir: Directory to write into. Defaults to ``self.data_dir`` when
        None. The ``vocab``, ``pos``, ``deps`` and ``ner`` sub-directories
        are opened for writing directly, so they must already exist.
    """
    out_dir = self.data_dir if data_dir is None else data_dir

    self.parser.model.end_training()
    self.entity.model.end_training()
    self.tagger.model.end_training()

    strings_loc = path.join(out_dir, 'vocab', 'strings.txt')
    self.vocab.strings.dump(strings_loc)

    # (sub-directory, file name, accessor) for each frequency table. The
    # accessors are deferred so each table is only looked up once its output
    # file has been opened, one table at a time.
    freq_tables = (
        ('pos', 'tag_freqs.json', lambda: self.tagger.freqs[TAG]),
        ('deps', 'head_freqs.json', lambda: self.parser.moves.freqs[HEAD]),
        ('deps', 'dep_freqs.json', lambda: self.parser.moves.freqs[DEP]),
        ('ner', 'iob_freqs.json', lambda: self.entity.moves.freqs[ENT_IOB]),
        ('ner', 'ne_freqs.json', lambda: self.entity.moves.freqs[ENT_TYPE]),
    )
    for subdir, filename, get_freqs in freq_tables:
        with open(path.join(out_dir, subdir, filename), 'w') as file_:
            json.dump(list(get_freqs().items()), file_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tags(self):
|
def tags(self):
|
||||||
"""List of part-of-speech tag names."""
|
"""List of part-of-speech tag names."""
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from preshed.maps cimport PreshMapArray
|
from preshed.maps cimport PreshMapArray
|
||||||
|
from preshed.counter cimport PreshCounter
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .._ml cimport Model
|
from .._ml cimport Model
|
||||||
|
@ -14,6 +15,7 @@ cdef class EnPosTagger:
|
||||||
cdef readonly Model model
|
cdef readonly Model model
|
||||||
cdef public object lemmatizer
|
cdef public object lemmatizer
|
||||||
cdef PreshMapArray _morph_cache
|
cdef PreshMapArray _morph_cache
|
||||||
|
cdef public dict freqs
|
||||||
|
|
||||||
cdef PosTag* tags
|
cdef PosTag* tags
|
||||||
cdef readonly object tag_names
|
cdef readonly object tag_names
|
||||||
|
|
|
@ -7,6 +7,7 @@ from libc.string cimport memset
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
from thinc.typedefs cimport atom_t, weight_t
|
from thinc.typedefs cimport atom_t, weight_t
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
||||||
|
@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc
|
||||||
from ..morphology cimport set_morph_from_dict
|
from ..morphology cimport set_morph_from_dict
|
||||||
from .._ml cimport arg_max
|
from .._ml cimport arg_max
|
||||||
|
|
||||||
from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
|
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
@ -260,6 +261,7 @@ cdef class EnPosTagger:
|
||||||
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
|
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
|
||||||
'morphs.json'))))
|
'morphs.json'))))
|
||||||
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
||||||
|
self.freqs = {TAG: defaultdict(int)}
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||||
|
@ -309,6 +311,7 @@ cdef class EnPosTagger:
|
||||||
tokens.data[i].tag = self.strings[self.tag_names[guess]]
|
tokens.data[i].tag = self.strings[self.tag_names[guess]]
|
||||||
self.set_morph(i, &self.tags[guess], tokens.data)
|
self.set_morph(i, &self.tags[guess], tokens.data)
|
||||||
correct += loss == 0
|
correct += loss == 0
|
||||||
|
self.freqs[TAG][tokens.data[i].tag] += 1
|
||||||
return correct
|
return correct
|
||||||
|
|
||||||
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
|
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
|
||||||
|
|
|
@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef class BitArray:
|
cdef class BitArray:
|
||||||
def __init__(self):
|
def __init__(self, data=b''):
|
||||||
self.data = b''
|
self.data = data
|
||||||
self.byte = 0
|
self.byte = 0
|
||||||
self.bit_of_byte = 0
|
self.bit_of_byte = 0
|
||||||
self.i = 0
|
self.i = 0
|
||||||
|
|
|
@ -4,4 +4,5 @@ from ..vocab cimport Vocab
|
||||||
cdef class Packer:
|
cdef class Packer:
|
||||||
cdef readonly tuple attrs
|
cdef readonly tuple attrs
|
||||||
cdef readonly tuple _codecs
|
cdef readonly tuple _codecs
|
||||||
|
cdef readonly object lex_codec
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
|
|
@ -8,15 +8,17 @@ from libcpp.pair cimport pair
|
||||||
from cymem.cymem cimport Address, Pool
|
from cymem.cymem cimport Address, Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
from ..structs cimport LexemeC
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
from .bits cimport BitArray
|
from .bits cimport BitArray
|
||||||
from .huffman cimport HuffmanCodec
|
from .huffman cimport HuffmanCodec
|
||||||
|
|
||||||
from os import path
|
from os import path
|
||||||
import numpy
|
import numpy
|
||||||
|
from .. import util
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
@ -67,8 +69,8 @@ cdef class _AttributeCodec:
|
||||||
item.first = count
|
item.first = count
|
||||||
item.second = key
|
item.second = key
|
||||||
items.push(item)
|
items.push(item)
|
||||||
weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
|
weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32)
|
||||||
self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
|
self._keys = <attr_t*>self.mem.alloc(items.size(), sizeof(attr_t))
|
||||||
self._map = {}
|
self._map = {}
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
while not items.empty():
|
while not items.empty():
|
||||||
|
@ -94,21 +96,33 @@ cdef class _AttributeCodec:
|
||||||
dest[i] = <attr_t>self._keys[dest[i]]
|
dest[i] = <attr_t>self._keys[dest[i]]
|
||||||
|
|
||||||
|
|
||||||
cdef class Packer:
|
def _gen_orths(Vocab vocab):
|
||||||
def __init__(self, Vocab vocab, list_of_attr_freqs):
|
cdef attr_t orth
|
||||||
self.vocab = vocab
|
cdef size_t addr
|
||||||
codecs = []
|
for orth, addr in vocab._by_orth.items():
|
||||||
attrs = []
|
lex = <LexemeC*>addr
|
||||||
|
yield orth, c_exp(lex.prob)
|
||||||
|
|
||||||
for attr, freqs in list_of_attr_freqs:
|
|
||||||
if attr == SPACY:
|
cdef class Packer:
|
||||||
codecs.append(_BinaryCodec())
|
def __init__(self, Vocab vocab, attr_freqs):
|
||||||
else:
|
self.vocab = vocab
|
||||||
|
self.lex_codec = _AttributeCodec(_gen_orths(vocab))
|
||||||
|
|
||||||
|
codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()]
|
||||||
|
attrs = [ORTH, SPACY]
|
||||||
|
for attr, freqs in sorted(attr_freqs):
|
||||||
|
if attr in (ORTH, ID, SPACY):
|
||||||
|
continue
|
||||||
codecs.append(_AttributeCodec(freqs))
|
codecs.append(_AttributeCodec(freqs))
|
||||||
attrs.append(attr)
|
attrs.append(attr)
|
||||||
self._codecs = tuple(codecs)
|
self._codecs = tuple(codecs)
|
||||||
self.attrs = tuple(attrs)
|
self.attrs = tuple(attrs)
|
||||||
|
|
||||||
|
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
    """Alternate constructor: build a Packer from frequency files on disk.

    vocab: The Vocab the packer is bound to.
    data_dir: Passed unchanged to util.read_encoding_freqs, which loads the
        per-attribute frequency tables.
    """
    attr_freqs = util.read_encoding_freqs(data_dir)
    return cls(vocab, attr_freqs)
|
||||||
|
|
||||||
def pack(self, Doc doc):
|
def pack(self, Doc doc):
|
||||||
array = doc.to_array(self.attrs)
|
array = doc.to_array(self.attrs)
|
||||||
cdef BitArray bits = BitArray()
|
cdef BitArray bits = BitArray()
|
||||||
|
@ -124,6 +138,4 @@ cdef class Packer:
|
||||||
array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
|
array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
|
||||||
for i, codec in enumerate(self._codecs):
|
for i, codec in enumerate(self._codecs):
|
||||||
codec.decode(bits, array[:, i])
|
codec.decode(bits, array[:, i])
|
||||||
doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
|
return array
|
||||||
doc.from_array(self.attrs, array)
|
|
||||||
return doc
|
|
||||||
|
|
|
@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
gold.c.heads[i] = gold.heads[i]
|
gold.c.heads[i] = gold.heads[i]
|
||||||
gold.c.labels[i] = self.strings[label]
|
gold.c.labels[i] = self.strings[label]
|
||||||
|
# Count frequencies, for use in encoder
|
||||||
|
self.freqs[HEAD][gold.c.heads[i] - i] += 1
|
||||||
|
self.freqs[DEP][gold.c.labels[i]] += 1
|
||||||
for end, brackets in gold.brackets.items():
|
for end, brackets in gold.brackets.items():
|
||||||
for start, label_strs in brackets.items():
|
for start, label_strs in brackets.items():
|
||||||
gold.c.brackets[start][end] = 1
|
gold.c.brackets[start][end] = 1
|
||||||
|
|
|
@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
|
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
|
||||||
|
@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
|
||||||
|
# Count frequencies, for use in encoder
|
||||||
|
if gold.c.ner[i].move in (BEGIN, UNIT):
|
||||||
|
self.freqs[ENT_IOB][3] += 1
|
||||||
|
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
|
||||||
|
elif gold.c.ner[i].move in (IN, LAST):
|
||||||
|
self.freqs[ENT_IOB][2] += 1
|
||||||
|
self.freqs[ENT_TYPE][0] += 1
|
||||||
|
elif gold.c.ner[i].move == OUT:
|
||||||
|
self.freqs[ENT_IOB][1] += 1
|
||||||
|
self.freqs[ENT_TYPE][0] += 1
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
if name == '-':
|
if name == '-':
|
||||||
|
|
|
@ -35,6 +35,7 @@ cdef class TransitionSystem:
|
||||||
cdef bint* _is_valid
|
cdef bint* _is_valid
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
cdef public int root_label
|
cdef public int root_label
|
||||||
|
cdef public freqs
|
||||||
|
|
||||||
cdef int initialize_state(self, StateClass state) except -1
|
cdef int initialize_state(self, StateClass state) except -1
|
||||||
cdef int finalize_state(self, StateClass state) nogil
|
cdef int finalize_state(self, StateClass state) nogil
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from ..structs cimport TokenC
|
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
@ -28,6 +30,9 @@ cdef class TransitionSystem:
|
||||||
i += 1
|
i += 1
|
||||||
self.c = moves
|
self.c = moves
|
||||||
self.root_label = self.strings['ROOT']
|
self.root_label = self.strings['ROOT']
|
||||||
|
self.freqs = {}
|
||||||
|
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
|
||||||
|
self.freqs[attr] = defaultdict(int)
|
||||||
|
|
||||||
cdef int initialize_state(self, StateClass state) except -1:
|
cdef int initialize_state(self, StateClass state) except -1:
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -2,6 +2,7 @@ cimport cython
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
import struct
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..strings cimport slice_unicode
|
from ..strings cimport slice_unicode
|
||||||
|
@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr
|
||||||
from .spans import Span
|
from .spans import Span
|
||||||
from ..structs cimport UniStr
|
from ..structs cimport UniStr
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
from ..serialize.bits cimport BitArray
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -54,7 +56,7 @@ cdef class Doc:
|
||||||
Container class for annotated text. Constructed via English.__call__ or
|
Container class for annotated text. Constructed via English.__call__ or
|
||||||
Tokenizer.__call__.
|
Tokenizer.__call__.
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab):
|
def __init__(self, Vocab vocab, orths_and_spaces=None):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
size = 20
|
size = 20
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -71,24 +73,17 @@ cdef class Doc:
|
||||||
self.is_tagged = False
|
self.is_tagged = False
|
||||||
self.is_parsed = False
|
self.is_parsed = False
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_ids(cls, Vocab vocab, orths, spaces):
|
|
||||||
cdef int i
|
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
cdef Doc self = cls(vocab)
|
|
||||||
cdef bint space = 0
|
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
for i in range(len(orths)):
|
cdef bint space
|
||||||
orth = orths[i]
|
if orths_and_spaces is not None:
|
||||||
|
for orth, space in orths_and_spaces:
|
||||||
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
lex = <LexemeC*>self.vocab._by_orth.get(orth)
|
||||||
if lex != NULL:
|
if lex != NULL:
|
||||||
assert lex.orth == orth
|
assert lex.orth == orth
|
||||||
space = spaces[i]
|
|
||||||
self.push_back(lex, space)
|
self.push_back(lex, space)
|
||||||
else:
|
else:
|
||||||
raise Exception('Lexeme not found: %d' % orth)
|
raise Exception('Lexeme not found: %d' % orth)
|
||||||
return self
|
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Get a token.
|
"""Get a token.
|
||||||
|
@ -389,3 +384,26 @@ cdef class Doc:
|
||||||
elif attr_id == ENT_TYPE:
|
elif attr_id == ENT_TYPE:
|
||||||
for i in range(length):
|
for i in range(length):
|
||||||
tokens[i].ent_type = values[i]
|
tokens[i].ent_type = values[i]
|
||||||
|
|
||||||
|
def to_bytes(self):
    """Serialise this document to a length-prefixed byte string.

    The document is packed by ``self.vocab.packer``. The result is a 4-byte
    unsigned int header holding ``len(bits)``, followed by the packed
    payload from ``bits.as_bytes()``.

    NOTE: the header uses struct's native byte order ('I'), so the output
    is only readable on machines with the same endianness.
    """
    packed = self.vocab.packer.pack(self)
    header = struct.pack('I', len(packed))
    return header + packed.as_bytes()
|
||||||
|
|
||||||
|
@staticmethod
def from_bytes(Vocab vocab, file_):
    """Yield the documents stored in a binary stream, one at a time.

    Each record is a 4-byte unsigned int header (struct format 'I', native
    byte order) giving the payload size in bits, followed by the payload
    rounded up to whole bytes. The payload is decoded with ``vocab.packer``.

    vocab: Vocab whose ``packer`` decodes the payload and which owns the
        lexemes of the resulting documents.
    file_: Binary file-like object supporting ``read(n)``.

    Yields: one Doc per record, with the attributes in
        ``vocab.packer.attrs`` set from the decoded array.

    Raises: EOFError if a record's payload is shorter than its header
        declares (truncated stream).
    """
    while True:
        header = file_.read(4)
        if len(header) < 4:
            # End of stream. A short header is treated as "no more records".
            break
        n_bits = struct.unpack('I', header)[0]
        # Round up to whole bytes: the final byte may be partially filled.
        n_bytes = n_bits // 8 + bool(n_bits % 8)
        data = file_.read(n_bytes)
        if len(data) < n_bytes:
            # Fail loudly instead of handing a short buffer to the decoder.
            raise EOFError(
                'Truncated document record: expected %d bytes, got %d'
                % (n_bytes, len(data)))
        bits = BitArray(data)
        array = vocab.packer.unpack(bits)
        # The first two columns are passed to the Doc constructor as its
        # (orth, space) pairs; from_array then sets the packer's attributes.
        doc = Doc(vocab, array[:, :2])
        doc.from_array(vocab.packer.attrs, array)
        yield doc
|
||||||
|
|
|
@ -2,6 +2,7 @@ from os import path
|
||||||
import codecs
|
import codecs
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
|
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
|
||||||
|
|
||||||
|
@ -64,7 +65,17 @@ def read_tokenization(lang):
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def read_detoken_rules(lang):
|
def read_encoding_freqs(data_dir):
    """Load the attribute frequency tables used to build the serialiser.

    The tables are read from JSON files located relative to the *parent*
    of ``data_dir``: ``pos/tag_freqs.json``, ``deps/head_freqs.json``,
    ``deps/dep_freqs.json``, ``ner/iob_freqs.json`` and
    ``ner/ne_freqs.json``.

    data_dir: Directory whose parent contains the ``pos``, ``deps`` and
        ``ner`` sub-directories.

    Returns: a list of ``(attr_id, freqs)`` pairs for TAG, HEAD, DEP,
        ENT_IOB and ENT_TYPE, in that order, where ``freqs`` is the decoded
        JSON content of the corresponding file.
    """
    def load_freqs(subdir, filename):
        # Use a context manager so each file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(path.join(data_dir, '..', subdir, filename)) as file_:
            return json.load(file_)

    tags = load_freqs('pos', 'tag_freqs.json')
    heads = load_freqs('deps', 'head_freqs.json')
    deps = load_freqs('deps', 'dep_freqs.json')
    iob = load_freqs('ner', 'iob_freqs.json')
    ne_types = load_freqs('ner', 'ne_freqs.json')
    return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
            (ENT_TYPE, ne_types)]
|
||||||
|
|
||||||
|
|
||||||
|
def read_detoken_rules(lang): # Deprecated?
|
||||||
loc = path.join(DATA_DIR, lang, 'detokenize')
|
loc = path.join(DATA_DIR, lang, 'detokenize')
|
||||||
entries = []
|
entries = []
|
||||||
with utf8open(loc) as file_:
|
with utf8open(loc) as file_:
|
||||||
|
@ -73,7 +84,7 @@ def read_detoken_rules(lang):
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def align_tokens(ref, indices):
|
def align_tokens(ref, indices): # Deprecated, surely?
|
||||||
start = 0
|
start = 0
|
||||||
queue = list(indices)
|
queue = list(indices)
|
||||||
for token in ref:
|
for token in ref:
|
||||||
|
@ -86,7 +97,7 @@ def align_tokens(ref, indices):
|
||||||
assert not queue
|
assert not queue
|
||||||
|
|
||||||
|
|
||||||
def detokenize(token_rules, words):
|
def detokenize(token_rules, words): # Deprecated?
|
||||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
||||||
sequence of tokens that are separated by whitespace in actual strings. Each
|
sequence of tokens that are separated by whitespace in actual strings. Each
|
||||||
chunk should be a tuple of token indices, e.g.
|
chunk should be a tuple of token indices, e.g.
|
||||||
|
|
|
@ -29,6 +29,7 @@ cdef class Vocab:
|
||||||
cpdef readonly StringStore strings
|
cpdef readonly StringStore strings
|
||||||
cdef readonly object pos_tags
|
cdef readonly object pos_tags
|
||||||
cdef readonly int length
|
cdef readonly int length
|
||||||
|
cdef public object packer
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||||
|
|
|
@ -16,6 +16,8 @@ from .orth cimport word_shape
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
|
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
|
from . import util
|
||||||
|
from .serialize.packer cimport Packer
|
||||||
|
|
||||||
|
|
||||||
DEF MAX_VEC_SIZE = 100000
|
DEF MAX_VEC_SIZE = 100000
|
||||||
|
@ -53,6 +55,8 @@ cdef class Vocab:
|
||||||
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
|
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
|
||||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
||||||
|
|
||||||
|
self.packer = Packer(self, util.read_encoding_freqs(data_dir))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored."""
|
"""The current number of lexemes stored."""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
|
@ -5,7 +5,6 @@ import numpy
|
||||||
|
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from spacy.serialize.packer import _BinaryCodec
|
from spacy.serialize.packer import _BinaryCodec
|
||||||
from spacy.serialize.packer import make_vocab_codec
|
|
||||||
from spacy.serialize.packer import _AttributeCodec
|
from spacy.serialize.packer import _AttributeCodec
|
||||||
from spacy.serialize.bits import BitArray
|
from spacy.serialize.bits import BitArray
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user