* Serialization round trip now working with a decent API, but with rough spots in the organisation, and the vocabulary still has to be fixed ahead of time.

Matthew Honnibal 2015-07-19 15:18:17 +02:00
parent 0973e2f107
commit 317cbbc015
16 changed files with 143 additions and 47 deletions
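A rough sketch of the round trip this commit describes, not part of the diff: it assumes an English model directory that was trained and finalised via nlp.end_training() (as in the train.py hunk below), so the new *_freqs.json files exist; nlp, docs.bin and the example text are illustrative.

    from spacy.en import English
    from spacy.tokens import Doc

    nlp = English()                          # loads the vocab, which now builds a Packer
    doc = nlp(u'This is a sentence.')

    with open('docs.bin', 'wb') as file_:
        file_.write(doc.to_bytes())          # 4-byte bit count + Huffman-packed payload

    with open('docs.bin', 'rb') as file_:
        restored = next(Doc.from_bytes(nlp.vocab, file_))   # from_bytes() is a generator
    assert [t.orth_ for t in restored] == [t.orth_ for t in doc]

Because from_bytes() keeps reading length-prefixed records until the file is exhausted, several packed Docs can be stored back to back in one file.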

View File

@@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc,
                                                    scorer.token_acc)
-    nlp.parser.model.end_training()
-    nlp.entity.model.end_training()
-    nlp.tagger.model.end_training()
-    nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt'))
+    nlp.end_training()


 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):

View File

@@ -1,6 +1,8 @@
 from __future__ import unicode_literals
 from os import path
 import re
+import struct
+import json
 from .. import orth
 from ..vocab import Vocab
@@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer
 from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory
+from ..serialize.bits import BitArray
 from ..tokens import Doc
 from ..multi_words import RegexMerger
@@ -19,6 +22,8 @@ from . import regexes
 from ..util import read_lang_data
+from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB

 def get_lex_props(string):
     return {
@@ -74,7 +79,7 @@ class English(object):
                  load_vectors=True
                  ):
-        self._data_dir = data_dir
+        self.data_dir = data_dir
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                            get_lex_props=get_lex_props, load_vectors=load_vectors,
@@ -140,6 +145,29 @@ class English(object):
             self.mwe_merger(tokens)
         return tokens

+    def end_training(self, data_dir=None):
+        if data_dir is None:
+            data_dir = self.data_dir
+        self.parser.model.end_training()
+        self.entity.model.end_training()
+        self.tagger.model.end_training()
+        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
+
+        with open(path.join(data_dir, 'pos', 'tag_freqs.json'), 'w') as file_:
+            json.dump(list(self.tagger.freqs[TAG].items()), file_)
+        with open(path.join(data_dir, 'deps', 'head_freqs.json'), 'w') as file_:
+            json.dump(list(self.parser.moves.freqs[HEAD].items()), file_)
+        with open(path.join(data_dir, 'deps', 'dep_freqs.json'), 'w') as file_:
+            json.dump(list(self.parser.moves.freqs[DEP].items()), file_)
+        with open(path.join(data_dir, 'ner', 'iob_freqs.json'), 'w') as file_:
+            json.dump(list(self.entity.moves.freqs[ENT_IOB].items()), file_)
+        with open(path.join(data_dir, 'ner', 'ne_freqs.json'), 'w') as file_:
+            json.dump(list(self.entity.moves.freqs[ENT_TYPE].items()), file_)
+
     @property
     def tags(self):
         """List of part-of-speech tag names."""

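The files written by end_training() above are plain JSON dumps of (key, count) pairs. A small sketch of reading them back, purely for illustration: load_freqs and the models/en path are hypothetical; spaCy's own loader is util.read_encoding_freqs(), added further down in this commit.

    import json
    from os import path

    def load_freqs(model_dir, *parts):
        # Each file holds a list of [key, count] pairs, so dict() restores the counter.
        with open(path.join(model_dir, *parts)) as file_:
            return dict(json.load(file_))

    tag_freqs = load_freqs('models/en', 'pos', 'tag_freqs.json')      # {string-store id: count}
    head_freqs = load_freqs('models/en', 'deps', 'head_freqs.json')   # {relative head offset: count}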
View File

@@ -1,4 +1,5 @@
 from preshed.maps cimport PreshMapArray
+from preshed.counter cimport PreshCounter
 from cymem.cymem cimport Pool
 from .._ml cimport Model
@@ -14,6 +15,7 @@ cdef class EnPosTagger:
     cdef readonly Model model
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache
+    cdef public dict freqs

     cdef PosTag* tags
     cdef readonly object tag_names

View File

@@ -7,6 +7,7 @@ from libc.string cimport memset
 from cymem.cymem cimport Address
 from thinc.typedefs cimport atom_t, weight_t
+from collections import defaultdict
 from ..parts_of_speech cimport univ_pos_t
 from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
@@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
-from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
+from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
 from ..typedefs cimport attr_t
 from .lemmatizer import Lemmatizer
@@ -260,6 +261,7 @@ cdef class EnPosTagger:
         self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
                                                             'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
+        self.freqs = {TAG: defaultdict(int)}

     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -309,6 +311,7 @@ cdef class EnPosTagger:
             tokens.data[i].tag = self.strings[self.tag_names[guess]]
             self.set_morph(i, &self.tags[guess], tokens.data)
             correct += loss == 0
+            self.freqs[TAG][tokens.data[i].tag] += 1
         return correct

     cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:

View File

@@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
 cdef class BitArray:
-    def __init__(self):
-        self.data = b''
+    def __init__(self, data=b''):
+        self.data = data
         self.byte = 0
         self.bit_of_byte = 0
         self.i = 0

View File

@@ -4,4 +4,5 @@ from ..vocab cimport Vocab
 cdef class Packer:
     cdef readonly tuple attrs
     cdef readonly tuple _codecs
+    cdef readonly object lex_codec
     cdef readonly Vocab vocab

View File

@@ -8,15 +8,17 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap
-from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
+from ..structs cimport LexemeC
 from ..typedefs cimport attr_t
 from .bits cimport BitArray
 from .huffman cimport HuffmanCodec

 from os import path
 import numpy
+from .. import util

 cimport cython
@@ -67,8 +69,8 @@ cdef class _AttributeCodec:
             item.first = count
             item.second = key
             items.push(item)
-        weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
-        self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
+        weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32)
+        self._keys = <attr_t*>self.mem.alloc(items.size(), sizeof(attr_t))
         self._map = {}
         cdef int i = 0
         while not items.empty():
@@ -94,21 +96,33 @@ cdef class _AttributeCodec:
             dest[i] = <attr_t>self._keys[dest[i]]

-cdef class Packer:
-    def __init__(self, Vocab vocab, list_of_attr_freqs):
-        self.vocab = vocab
-        codecs = []
-        attrs = []
-        for attr, freqs in list_of_attr_freqs:
-            if attr == SPACY:
-                codecs.append(_BinaryCodec())
-            else:
-                codecs.append(_AttributeCodec(freqs))
-            attrs.append(attr)
+def _gen_orths(Vocab vocab):
+    cdef attr_t orth
+    cdef size_t addr
+    for orth, addr in vocab._by_orth.items():
+        lex = <LexemeC*>addr
+        yield orth, c_exp(lex.prob)
+
+
+cdef class Packer:
+    def __init__(self, Vocab vocab, attr_freqs):
+        self.vocab = vocab
+        self.lex_codec = _AttributeCodec(_gen_orths(vocab))
+
+        codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()]
+        attrs = [ORTH, SPACY]
+        for attr, freqs in sorted(attr_freqs):
+            if attr in (ORTH, ID, SPACY):
+                continue
+            codecs.append(_AttributeCodec(freqs))
+            attrs.append(attr)
         self._codecs = tuple(codecs)
         self.attrs = tuple(attrs)

+    @classmethod
+    def from_dir(cls, Vocab vocab, data_dir):
+        return cls(vocab, util.read_encoding_freqs(data_dir))
+
     def pack(self, Doc doc):
         array = doc.to_array(self.attrs)
         cdef BitArray bits = BitArray()
@@ -124,6 +138,4 @@ cdef class Packer:
         array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32)
         for i, codec in enumerate(self._codecs):
             codec.decode(bits, array[:, i])
-        doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1])
-        doc.from_array(self.attrs, array)
-        return doc
+        return array
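A sketch of how the reorganised Packer is meant to be used, given the hunks above and the Vocab change further down: ORTH gets a codec built from lexeme probabilities, SPACY the one-bit binary codec, and the remaining attributes Huffman codecs built from the stored frequency tables. It assumes the vocab now owns a packer built from util.read_encoding_freqs(); the example text is arbitrary.

    from spacy.en import English

    nlp = English()
    packer = nlp.vocab.packer            # assembled from the stored frequency tables

    doc = nlp(u'A small example.')
    bits = packer.pack(doc)              # Doc -> BitArray of coded attributes
    array = packer.unpack(bits)          # BitArray -> attribute array; no longer a Doc
    # Columns follow packer.attrs: ORTH and SPACY first, then the frequency-coded attrs.

Rebuilding a Doc from that array is now the caller's job; Doc.from_bytes() below does it with Doc(vocab, array[:, :2]) followed by doc.from_array().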

View File

@@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
+from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
@@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem):
                 label = 'ROOT'
             gold.c.heads[i] = gold.heads[i]
             gold.c.labels[i] = self.strings[label]
+            # Count frequencies, for use in encoder
+            self.freqs[HEAD][gold.c.heads[i] - i] += 1
+            self.freqs[DEP][gold.c.labels[i]] += 1
         for end, brackets in gold.brackets.items():
             for start, label_strs in brackets.items():
                 gold.c.brackets[start][end] = 1

View File

@@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity
 from thinc.typedefs cimport weight_t
 from ..gold cimport GoldParseC
 from ..gold cimport GoldParse
+from ..attrs cimport ENT_TYPE, ENT_IOB
 from .stateclass cimport StateClass
@@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem):
     cdef int preprocess_gold(self, GoldParse gold) except -1:
         for i in range(gold.length):
             gold.c.ner[i] = self.lookup_transition(gold.ner[i])
+            # Count frequencies, for use in encoder
+            if gold.c.ner[i].move in (BEGIN, UNIT):
+                self.freqs[ENT_IOB][3] += 1
+                self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1
+            elif gold.c.ner[i].move in (IN, LAST):
+                self.freqs[ENT_IOB][2] += 1
+                self.freqs[ENT_TYPE][0] += 1
+            elif gold.c.ner[i].move == OUT:
+                self.freqs[ENT_IOB][1] += 1
+                self.freqs[ENT_TYPE][0] += 1

     cdef Transition lookup_transition(self, object name) except *:
         if name == '-':

View File

@@ -35,6 +35,7 @@ cdef class TransitionSystem:
     cdef bint* _is_valid
     cdef readonly int n_moves
     cdef public int root_label
+    cdef public freqs

     cdef int initialize_state(self, StateClass state) except -1
     cdef int finalize_state(self, StateClass state) nogil

View File

@@ -1,8 +1,10 @@
 from cymem.cymem cimport Pool
-from ..structs cimport TokenC
 from thinc.typedefs cimport weight_t
+from collections import defaultdict
+from ..structs cimport TokenC
 from .stateclass cimport StateClass
+from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB

 cdef weight_t MIN_SCORE = -90000
@@ -28,6 +30,9 @@ cdef class TransitionSystem:
             i += 1
         self.c = moves
         self.root_label = self.strings['ROOT']
+        self.freqs = {}
+        for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
+            self.freqs[attr] = defaultdict(int)

     cdef int initialize_state(self, StateClass state) except -1:
         pass
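The transition system now owns the frequency counters that ArcEager and BiluoPushDown fill in during preprocess_gold(), as in the hunks above. A plain-Python sketch of the data shape, using stand-in names rather than the cimport'd attribute IDs; the specific keys and counts are illustrative only.

    from collections import defaultdict

    HEAD, DEP = 'HEAD', 'DEP'                # stand-ins for the real attr IDs
    freqs = {attr: defaultdict(int) for attr in (HEAD, DEP)}

    # arc_eager counts relative head offsets and string-store label ids:
    freqs[HEAD][1] += 1                      # head is the next token
    freqs[DEP][467] += 1                     # 467: an illustrative string-store id

    # English.end_training() later dumps list(freqs[HEAD].items()) to JSON,
    # which is the (key, count) format the attribute codecs expect.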

View File

@@ -2,6 +2,7 @@ cimport cython
 from libc.string cimport memcpy, memset
 import numpy
+import struct
 from ..lexeme cimport EMPTY_LEXEME
 from ..strings cimport slice_unicode
@@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr
 from .spans import Span
 from ..structs cimport UniStr
 from .token cimport Token
+from ..serialize.bits cimport BitArray

 DEF PADDING = 5
@@ -54,7 +56,7 @@ cdef class Doc:
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.
     """
-    def __init__(self, Vocab vocab):
+    def __init__(self, Vocab vocab, orths_and_spaces=None):
         self.vocab = vocab
         size = 20
         self.mem = Pool()
@@ -71,24 +73,17 @@ cdef class Doc:
         self.is_tagged = False
         self.is_parsed = False
         self._py_tokens = []
-
-    @classmethod
-    def from_ids(cls, Vocab vocab, orths, spaces):
-        cdef int i
-        cdef const LexemeC* lex
-        cdef Doc self = cls(vocab)
-        cdef bint space = 0
-        cdef attr_t orth
-        for i in range(len(orths)):
-            orth = orths[i]
-            lex = <LexemeC*>self.vocab._by_orth.get(orth)
-            if lex != NULL:
-                assert lex.orth == orth
-                space = spaces[i]
-                self.push_back(lex, space)
-            else:
-                raise Exception('Lexeme not found: %d' % orth)
-        return self
+        cdef const LexemeC* lex
+        cdef attr_t orth
+        cdef bint space
+        if orths_and_spaces is not None:
+            for orth, space in orths_and_spaces:
+                lex = <LexemeC*>self.vocab._by_orth.get(orth)
+                if lex != NULL:
+                    assert lex.orth == orth
+                    self.push_back(lex, space)
+                else:
+                    raise Exception('Lexeme not found: %d' % orth)

     def __getitem__(self, object i):
         """Get a token.
@@ -389,3 +384,26 @@ cdef class Doc:
         elif attr_id == ENT_TYPE:
             for i in range(length):
                 tokens[i].ent_type = values[i]
+
+    def to_bytes(self):
+        bits = self.vocab.packer.pack(self)
+        return struct.pack('I', len(bits)) + bits.as_bytes()
+
+    @staticmethod
+    def from_bytes(Vocab vocab, file_):
+        keep_reading = True
+        while keep_reading:
+            try:
+                n_bits_str = file_.read(4)
+                if len(n_bits_str) < 4:
+                    break
+                n_bits = struct.unpack('I', n_bits_str)[0]
+                n_bytes = n_bits // 8 + bool(n_bits % 8)
+                data = file_.read(n_bytes)
+            except StopIteration:
+                keep_reading = False
+            bits = BitArray(data)
+            array = vocab.packer.unpack(bits)
+            doc = Doc(vocab, array[:, :2])
+            doc.from_array(vocab.packer.attrs, array)
+            yield doc
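A usage sketch for the two methods above, assuming an English pipeline has already been loaded as nlp; each record on disk is a 4-byte bit count followed by the packed bytes, so several Docs can be streamed through one file object. The file name and texts are illustrative.

    texts = [u'First document.', u'And a second one.']

    with open('many_docs.bin', 'wb') as file_:
        for text in texts:
            file_.write(nlp(text).to_bytes())

    with open('many_docs.bin', 'rb') as file_:
        for doc in Doc.from_bytes(nlp.vocab, file_):    # yields one Doc per record
            print doc[0].orth_, len(doc)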

View File

@@ -2,6 +2,7 @@ from os import path
 import codecs
 import json
 import re
+from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
@@ -64,7 +65,17 @@ def read_tokenization(lang):
     return entries

-def read_detoken_rules(lang):
+def read_encoding_freqs(data_dir):
+    tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
+    heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
+    deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
+    iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
+    ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
+    return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
+            (ENT_TYPE, ne_types)]
+
+
+def read_detoken_rules(lang): # Deprecated?
     loc = path.join(DATA_DIR, lang, 'detokenize')
     entries = []
     with utf8open(loc) as file_:
@@ -73,7 +84,7 @@ def read_detoken_rules(lang):
     return entries

-def align_tokens(ref, indices):
+def align_tokens(ref, indices): # Deprecated, surely?
     start = 0
     queue = list(indices)
     for token in ref:
@@ -86,7 +97,7 @@ def align_tokens(ref, indices):
     assert not queue

-def detokenize(token_rules, words):
+def detokenize(token_rules, words): # Deprecated?
     """To align with treebanks, return a list of "chunks", where a chunk is a
     sequence of tokens that are separated by whitespace in actual strings. Each
     chunk should be a tuple of token indices, e.g.

View File

@@ -29,6 +29,7 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cdef readonly object pos_tags
     cdef readonly int length
+    cdef public object packer

     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

View File

@@ -16,6 +16,8 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from cymem.cymem cimport Address
+from . import util
+from .serialize.packer cimport Packer

 DEF MAX_VEC_SIZE = 100000
@@ -53,6 +55,8 @@ cdef class Vocab:
             if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
                 self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
+            self.packer = Packer(self, util.read_encoding_freqs(data_dir))

     def __len__(self):
         """The current number of lexemes stored."""
         return self.length

View File

@@ -5,7 +5,6 @@ import numpy
 from spacy.vocab import Vocab
 from spacy.serialize.packer import _BinaryCodec
-from spacy.serialize.packer import make_vocab_codec
 from spacy.serialize.packer import _AttributeCodec
 from spacy.serialize.bits import BitArray