* Serialization round trip now working with a decent API, but there are rough spots in the organisation, and the vocabulary still has to be fixed ahead of time.

Matthew Honnibal 2015-07-19 15:18:17 +02:00
parent 0973e2f107
commit 317cbbc015
16 changed files with 143 additions and 47 deletions

View File

@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc)
nlp.parser.model.end_training()
nlp.entity.model.end_training()
nlp.tagger.model.end_training()
nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
nlp.end_training()
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
beam_width=None):
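The four per-component teardown calls are folded into a single English.end_training() helper (added in the next file), so the tail of a training script reduces to something like the sketch below (construction details are assumed, not shown in this diff):

    nlp = English(data_dir=model_dir)   # hypothetical setup; train as before
    # ... training iterations ...
    nlp.end_training()                  # ends each model and dumps strings + frequency tables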

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
from os import path
import re
import struct
import json
from .. import orth
from ..vocab import Vocab
@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..serialize.bits import BitArray
from ..tokens import Doc
from ..multi_words import RegexMerger
@ -19,6 +22,8 @@ from . import regexes
from ..util import read_lang_data
from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
def get_lex_props(string):
return {
@ -74,7 +79,7 @@ class English(object):
load_vectors=True
):
self._data_dir = data_dir
self.data_dir = data_dir
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors,
@ -140,6 +145,29 @@ class English(object):
self.mwe_merger(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training()
self.entity.model.end_training()
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'pos', 'tag_freqs.json'), 'w') as file_:
json.dump(list(self.tagger.freqs[TAG].items()), file_)
with open(path.join(data_dir, 'deps', 'head_freqs.json'), 'w') as file_:
json.dump(list(self.parser.moves.freqs[HEAD].items()), file_)
with open(path.join(data_dir, 'deps', 'dep_freqs.json'), 'w') as file_:
json.dump(list(self.parser.moves.freqs[DEP].items()), file_)
with open(path.join(data_dir, 'ner', 'iob_freqs.json'), 'w') as file_:
json.dump(list(self.entity.moves.freqs[ENT_IOB].items()), file_)
with open(path.join(data_dir, 'ner', 'ne_freqs.json'), 'w') as file_:
json.dump(list(self.entity.moves.freqs[ENT_TYPE].items()), file_)
@property
def tags(self):
"""List of part-of-speech tag names."""

View File

@ -1,4 +1,5 @@
from preshed.maps cimport PreshMapArray
from preshed.counter cimport PreshCounter
from cymem.cymem cimport Pool
from .._ml cimport Model
@ -14,6 +15,7 @@ cdef class EnPosTagger:
cdef readonly Model model
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef public dict freqs
cdef PosTag* tags
cdef readonly object tag_names

View File

@ -7,6 +7,7 @@ from libc.string cimport memset
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
from collections import defaultdict
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t
from .lemmatizer import Lemmatizer
@ -260,6 +261,7 @@ cdef class EnPosTagger:
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
@ -309,6 +311,7 @@ cdef class EnPosTagger:
tokens.data[i].tag = self.strings[self.tag_names[guess]]
self.set_morph(i, &self.tags[guess], tokens.data)
correct += loss == 0
self.freqs[TAG][tokens.data[i].tag] += 1
return correct
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:

View File

@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray:
def __init__(self):
self.data = b''
def __init__(self, data=b''):
self.data = data
self.byte = 0
self.bit_of_byte = 0
self.i = 0

View File

@ -4,4 +4,5 @@ from ..vocab cimport Vocab
cdef class Packer:
cdef readonly tuple attrs
cdef readonly tuple _codecs
cdef readonly object lex_codec
cdef readonly Vocab vocab

View File

@ -8,15 +8,17 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..structs cimport LexemeC
from ..typedefs cimport attr_t
from .bits cimport BitArray
from .huffman cimport HuffmanCodec
from os import path
import numpy
from .. import util
cimport cython
@ -67,8 +69,8 @@ cdef class _AttributeCodec:
item.first = count
item.second = key
items.push(item)
weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32)
self._keys = <attr_t*>self.mem.alloc(items.size(), sizeof(attr_t))
self._map = {}
cdef int i = 0
while not items.empty():
@ -94,21 +96,33 @@ cdef class _AttributeCodec:
dest[i] = <attr_t>self._keys[dest[i]]
cdef class Packer:
def __init__(self, Vocab vocab, list_of_attr_freqs):
self.vocab = vocab
codecs = []
attrs = []
def _gen_orths(Vocab vocab):
cdef attr_t orth
cdef size_t addr
for orth, addr in vocab._by_orth.items():
lex = <LexemeC*>addr
yield orth, c_exp(lex.prob)
for attr, freqs in list_of_attr_freqs:
if attr == SPACY:
codecs.append(_BinaryCodec())
else:
codecs.append(_AttributeCodec(freqs))
cdef class Packer:
def __init__(self, Vocab vocab, attr_freqs):
self.vocab = vocab
self.lex_codec = _AttributeCodec(_gen_orths(vocab))
codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()]
attrs = [ORTH, SPACY]
for attr, freqs in sorted(attr_freqs):
if attr in (ORTH, ID, SPACY):
continue
codecs.append(_AttributeCodec(freqs))
attrs.append(attr)
self._codecs = tuple(codecs)
self.attrs = tuple(attrs)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
return cls(vocab, util.read_encoding_freqs(data_dir))
def pack(self, Doc doc):
array = doc.to_array(self.attrs)
cdef BitArray bits = BitArray()
@ -124,6 +138,4 @@ cdef class Packer:
array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
for i, codec in enumerate(self._codecs):
codec.decode(bits, array[:, i])
doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
doc.from_array(self.attrs, array)
return doc
return array
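With this change the Packer owns one codec per column: an ORTH codec weighted by each lexeme's unigram probability (via _gen_orths), a binary codec for the trailing-space flag, and one frequency-table codec per remaining annotation attribute. A minimal round-trip sketch of the API as it stands after this commit (nlp construction is assumed):

    from spacy.en import English

    nlp = English()                    # Vocab builds its Packer from the freq files
    doc = nlp(u'This is a test.')

    packer = nlp.vocab.packer          # or Packer.from_dir(nlp.vocab, data_dir)
    bits = packer.pack(doc)            # BitArray of Huffman-coded columns
    array = packer.unpack(bits)        # numpy int32 array, one row per token
    # columns follow packer.attrs, which always starts with (ORTH, SPACY)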

View File

@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
# Count frequencies, for use in encoder
self.freqs[HEAD][gold.c.heads[i] - i] += 1
self.freqs[DEP][gold.c.labels[i]] += 1
for end, brackets in gold.brackets.items():
for start, label_strs in brackets.items():
gold.c.brackets[start][end] = 1
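Note that the HEAD counts are taken over relative offsets (gold.c.heads[i] - i) rather than absolute indices, so the codec sees a small set of very frequent values. A toy illustration of that transformation:

    # Illustration only: absolute head indices -> relative offsets, as counted above.
    heads = [1, 1, 1, 4, 4]                        # token i attaches to heads[i]
    offsets = [h - i for i, h in enumerate(heads)]
    print(offsets)                                 # [1, 0, -1, 1, 0]
    # small offsets like -1, 0, 1 dominate, which keeps the Huffman codes short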

View File

@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
from thinc.typedefs cimport weight_t
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from .stateclass cimport StateClass
@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem):
cdef int preprocess_gold(self, GoldParse gold) except -1:
for i in range(gold.length):
gold.c.ner[i] = self.lookup_transition(gold.ner[i])
# Count frequencies, for use in encoder
if gold.c.ner[i].move in (BEGIN, UNIT):
self.freqs[ENT_IOB][3] += 1
self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
elif gold.c.ner[i].move in (IN, LAST):
self.freqs[ENT_IOB][2] += 1
self.freqs[ENT_TYPE][0] += 1
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *:
if name == '-':
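The counts collapse the BILUO moves down to the three IOB codes used for encoding; the numeric codes are hard-coded above. A small sketch of that mapping, under the assumption that 1 = O, 2 = I, 3 = B:

    # Sketch only; mirrors the branches in preprocess_gold above.
    BILUO_TO_IOB = {'B': 3, 'U': 3, 'I': 2, 'L': 2, 'O': 1}
    tags = ['O', 'B', 'I', 'L', 'O', 'U']
    iob_codes = [BILUO_TO_IOB[t] for t in tags]    # [1, 3, 2, 2, 1, 3]
    # the real entity type is only counted for B/U moves; I/L/O count type 0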

View File

@ -35,6 +35,7 @@ cdef class TransitionSystem:
cdef bint* _is_valid
cdef readonly int n_moves
cdef public int root_label
cdef public freqs
cdef int initialize_state(self, StateClass state) except -1
cdef int finalize_state(self, StateClass state) nogil

View File

@ -1,8 +1,10 @@
from cymem.cymem cimport Pool
from ..structs cimport TokenC
from thinc.typedefs cimport weight_t
from collections import defaultdict
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
cdef weight_t MIN_SCORE = -90000
@ -28,6 +30,9 @@ cdef class TransitionSystem:
i += 1
self.c = moves
self.root_label = self.strings['ROOT']
self.freqs = {}
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
cdef int initialize_state(self, StateClass state) except -1:
pass

View File

@ -2,6 +2,7 @@ cimport cython
from libc.string cimport memcpy, memset
import numpy
import struct
from ..lexeme cimport EMPTY_LEXEME
from ..strings cimport slice_unicode
@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span
from ..structs cimport UniStr
from .token cimport Token
from ..serialize.bits cimport BitArray
DEF PADDING = 5
@ -54,7 +56,7 @@ cdef class Doc:
Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__.
"""
def __init__(self, Vocab vocab):
def __init__(self, Vocab vocab, orths_and_spaces=None):
self.vocab = vocab
size = 20
self.mem = Pool()
@ -71,24 +73,17 @@ cdef class Doc:
self.is_tagged = False
self.is_parsed = False
self._py_tokens = []
@classmethod
def from_ids(cls, Vocab vocab, orths, spaces):
cdef int i
cdef const LexemeC* lex
cdef Doc self = cls(vocab)
cdef bint space = 0
cdef attr_t orth
for i in range(len(orths)):
orth = orths[i]
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
space = spaces[i]
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
return self
cdef bint space
if orths_and_spaces is not None:
for orth, space in orths_and_spaces:
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
def __getitem__(self, object i):
"""Get a token.
@ -389,3 +384,26 @@ cdef class Doc:
elif attr_id == ENT_TYPE:
for i in range(length):
tokens[i].ent_type = values[i]
def to_bytes(self):
bits = self.vocab.packer.pack(self)
return struct.pack('I', len(bits)) + bits.as_bytes()
@staticmethod
def from_bytes(Vocab vocab, file_):
keep_reading = True
while keep_reading:
try:
n_bits_str = file_.read(4)
if len(n_bits_str) < 4:
break
n_bits = struct.unpack('I', n_bits_str)[0]
n_bytes = n_bits // 8 + bool(n_bits % 8)
data = file_.read(n_bytes)
except StopIteration:
keep_reading = False
bits = BitArray(data)
array = vocab.packer.unpack(bits)
doc = Doc(vocab, array[:, :2])
doc.from_array(vocab.packer.attrs, array)
yield doc
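to_bytes() prefixes the packed payload with a 4-byte struct-encoded bit count, and from_bytes() is a generator, so several documents can be concatenated into one file and streamed back; each is rebuilt by constructing a Doc from the decoded (orth, spacy) columns and restoring the remaining attributes via from_array. A rough usage sketch (file name and nlp setup are assumptions):

    from spacy.en import English
    from spacy.tokens import Doc

    nlp = English()
    with open('docs.bin', 'wb') as file_:
        for text in [u'Hello world.', u'Another document.']:
            file_.write(nlp(text).to_bytes())

    with open('docs.bin', 'rb') as file_:
        for doc in Doc.from_bytes(nlp.vocab, file_):
            print(doc[0].orth_)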

View File

@ -2,6 +2,7 @@ from os import path
import codecs
import json
import re
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@ -64,7 +65,17 @@ def read_tokenization(lang):
return entries
def read_detoken_rules(lang):
def read_encoding_freqs(data_dir):
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
(ENT_TYPE, ne_types)]
def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize')
entries = []
with utf8open(loc) as file_:
@ -73,7 +84,7 @@ def read_detoken_rules(lang):
return entries
def align_tokens(ref, indices):
def align_tokens(ref, indices): # Deprecated, surely?
start = 0
queue = list(indices)
for token in ref:
@ -86,7 +97,7 @@ def align_tokens(ref, indices):
assert not queue
def detokenize(token_rules, words):
def detokenize(token_rules, words): # Deprecated?
"""To align with treebanks, return a list of "chunks", where a chunk is a
sequence of tokens that are separated by whitespace in actual strings. Each
chunk should be a tuple of token indices, e.g.

View File

@ -29,6 +29,7 @@ cdef class Vocab:
cpdef readonly StringStore strings
cdef readonly object pos_tags
cdef readonly int length
cdef public object packer
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

View File

@ -16,6 +16,8 @@ from .orth cimport word_shape
from .typedefs cimport attr_t
from cymem.cymem cimport Address
from . import util
from .serialize.packer cimport Packer
DEF MAX_VEC_SIZE = 100000
@ -53,6 +55,8 @@ cdef class Vocab:
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
self.packer = Packer(self, util.read_encoding_freqs(data_dir))
def __len__(self):
"""The current number of lexemes stored."""
return self.length

View File

@ -5,7 +5,6 @@ import numpy
from spacy.vocab import Vocab
from spacy.serialize.packer import _BinaryCodec
from spacy.serialize.packer import make_vocab_codec
from spacy.serialize.packer import _AttributeCodec
from spacy.serialize.bits import BitArray