From 317cbbc015662c96b18bc7446ac7480897eaec7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 19 Jul 2015 15:18:17 +0200 Subject: [PATCH] * Serialization round trip now working with decent API, but with rough spots in the organisation and requiring vocabulary to be fixed ahead of time. --- bin/parser/train.py | 6 +--- spacy/en/__init__.py | 30 ++++++++++++++++- spacy/en/pos.pxd | 2 ++ spacy/en/pos.pyx | 5 ++- spacy/serialize/bits.pyx | 4 +-- spacy/serialize/packer.pxd | 1 + spacy/serialize/packer.pyx | 44 ++++++++++++++++--------- spacy/syntax/arc_eager.pyx | 4 +++ spacy/syntax/ner.pyx | 11 +++++++ spacy/syntax/transition_system.pxd | 1 + spacy/syntax/transition_system.pyx | 7 +++- spacy/tokens/doc.pyx | 52 ++++++++++++++++++++---------- spacy/util.py | 17 ++++++++-- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 4 +++ tests/serialize/test_codecs.py | 1 - 16 files changed, 143 insertions(+), 47 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index a3903848b..d706f7747 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -141,11 +141,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc) - nlp.parser.model.end_training() - nlp.entity.model.end_training() - nlp.tagger.model.end_training() - nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) - + nlp.end_training() def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None): diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 2ee5e4d84..240efd54e 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,6 +1,8 @@ from __future__ import unicode_literals from os import path import re +import struct +import json from .. import orth from ..vocab import Vocab @@ -8,6 +10,7 @@ from ..tokenizer import Tokenizer from ..syntax.arc_eager import ArcEager from ..syntax.ner import BiluoPushDown from ..syntax.parser import ParserFactory +from ..serialize.bits import BitArray from ..tokens import Doc from ..multi_words import RegexMerger @@ -19,6 +22,8 @@ from . import regexes from ..util import read_lang_data +from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB + def get_lex_props(string): return { @@ -74,7 +79,7 @@ class English(object): load_vectors=True ): - self._data_dir = data_dir + self.data_dir = data_dir self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, get_lex_props=get_lex_props, load_vectors=load_vectors, @@ -140,6 +145,29 @@ class English(object): self.mwe_merger(tokens) return tokens + def end_training(self, data_dir=None): + if data_dir is None: + data_dir = self.data_dir + self.parser.model.end_training() + self.entity.model.end_training() + self.tagger.model.end_training() + self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) + + with open(path.join(data_dir, 'pos', 'tag_freqs.json'), 'w') as file_: + json.dump(list(self.tagger.freqs[TAG].items()), file_) + + with open(path.join(data_dir, 'deps', 'head_freqs.json'), 'w') as file_: + json.dump(list(self.parser.moves.freqs[HEAD].items()), file_) + + with open(path.join(data_dir, 'deps', 'dep_freqs.json'), 'w') as file_: + json.dump(list(self.parser.moves.freqs[DEP].items()), file_) + + with open(path.join(data_dir, 'ner', 'iob_freqs.json'), 'w') as file_: + json.dump(list(self.entity.moves.freqs[ENT_IOB].items()), file_) + + with open(path.join(data_dir, 'ner', 'ne_freqs.json'), 'w') as file_: + json.dump(list(self.entity.moves.freqs[ENT_TYPE].items()), file_) + @property def tags(self): """List of part-of-speech tag names.""" diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index b59481020..2fc7b4ac7 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -1,4 +1,5 @@ from preshed.maps cimport PreshMapArray +from preshed.counter cimport PreshCounter from cymem.cymem cimport Pool from .._ml cimport Model @@ -14,6 +15,7 @@ cdef class EnPosTagger: cdef readonly Model model cdef public object lemmatizer cdef PreshMapArray _morph_cache + cdef public dict freqs cdef PosTag* tags cdef readonly object tag_names diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index db1679c28..3dab084a8 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -7,6 +7,7 @@ from libc.string cimport memset from cymem.cymem cimport Address from thinc.typedefs cimport atom_t, weight_t +from collections import defaultdict from ..parts_of_speech cimport univ_pos_t from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON @@ -17,7 +18,7 @@ from ..tokens.doc cimport Doc from ..morphology cimport set_morph_from_dict from .._ml cimport arg_max -from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL +from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL from ..typedefs cimport attr_t from .lemmatizer import Lemmatizer @@ -260,6 +261,7 @@ cdef class EnPosTagger: self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', 'morphs.json')))) self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) + self.freqs = {TAG: defaultdict(int)} def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. @@ -309,6 +311,7 @@ cdef class EnPosTagger: tokens.data[i].tag = self.strings[self.tag_names[guess]] self.set_morph(i, &self.tags[guess], tokens.data) correct += loss == 0 + self.freqs[TAG][tokens.data[i].tag] += 1 return correct cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx index 340b18cd5..3b879b2ee 100644 --- a/spacy/serialize/bits.pyx +++ b/spacy/serialize/bits.pyx @@ -14,8 +14,8 @@ cdef Code bit_append(Code code, bint bit) nogil: cdef class BitArray: - def __init__(self): - self.data = b'' + def __init__(self, data=b''): + self.data = data self.byte = 0 self.bit_of_byte = 0 self.i = 0 diff --git a/spacy/serialize/packer.pxd b/spacy/serialize/packer.pxd index 02bcdf56e..d8bc96553 100644 --- a/spacy/serialize/packer.pxd +++ b/spacy/serialize/packer.pxd @@ -4,4 +4,5 @@ from ..vocab cimport Vocab cdef class Packer: cdef readonly tuple attrs cdef readonly tuple _codecs + cdef readonly object lex_codec cdef readonly Vocab vocab diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index 09f6de57a..8c7adb1d8 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -8,15 +8,17 @@ from libcpp.pair cimport pair from cymem.cymem cimport Address, Pool from preshed.maps cimport PreshMap -from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..tokens.doc cimport Doc from ..vocab cimport Vocab +from ..structs cimport LexemeC from ..typedefs cimport attr_t from .bits cimport BitArray from .huffman cimport HuffmanCodec from os import path import numpy +from .. import util cimport cython @@ -67,8 +69,8 @@ cdef class _AttributeCodec: item.first = count item.second = key items.push(item) - weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32) - self._keys = self.mem.alloc(len(freqs), sizeof(attr_t)) + weights = numpy.ndarray(shape=(items.size(),), dtype=numpy.float32) + self._keys = self.mem.alloc(items.size(), sizeof(attr_t)) self._map = {} cdef int i = 0 while not items.empty(): @@ -94,21 +96,33 @@ cdef class _AttributeCodec: dest[i] = self._keys[dest[i]] -cdef class Packer: - def __init__(self, Vocab vocab, list_of_attr_freqs): - self.vocab = vocab - codecs = [] - attrs = [] +def _gen_orths(Vocab vocab): + cdef attr_t orth + cdef size_t addr + for orth, addr in vocab._by_orth.items(): + lex = addr + yield orth, c_exp(lex.prob) - for attr, freqs in list_of_attr_freqs: - if attr == SPACY: - codecs.append(_BinaryCodec()) - else: - codecs.append(_AttributeCodec(freqs)) + +cdef class Packer: + def __init__(self, Vocab vocab, attr_freqs): + self.vocab = vocab + self.lex_codec = _AttributeCodec(_gen_orths(vocab)) + + codecs = [_AttributeCodec(_gen_orths(vocab)), _BinaryCodec()] + attrs = [ORTH, SPACY] + for attr, freqs in sorted(attr_freqs): + if attr in (ORTH, ID, SPACY): + continue + codecs.append(_AttributeCodec(freqs)) attrs.append(attr) self._codecs = tuple(codecs) self.attrs = tuple(attrs) + @classmethod + def from_dir(cls, Vocab vocab, data_dir): + return cls(vocab, util.read_encoding_freqs(data_dir)) + def pack(self, Doc doc): array = doc.to_array(self.attrs) cdef BitArray bits = BitArray() @@ -124,6 +138,4 @@ cdef class Packer: array = numpy.zeros(shape=(length, len(self._codecs)), dtype=numpy.int32) for i, codec in enumerate(self._codecs): codec.decode(bits, array[:, i]) - doc = Doc.from_ids(self.vocab, array[:, 0], array[:, 1]) - doc.from_array(self.attrs, array) - return doc + return array diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index ebd8a0cac..0808fabf8 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -10,6 +10,7 @@ from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC +from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from libc.stdint cimport uint32_t from libc.string cimport memcpy @@ -309,6 +310,9 @@ cdef class ArcEager(TransitionSystem): label = 'ROOT' gold.c.heads[i] = gold.heads[i] gold.c.labels[i] = self.strings[label] + # Count frequencies, for use in encoder + self.freqs[HEAD][gold.c.heads[i] - i] += 1 + self.freqs[DEP][gold.c.labels[i]] += 1 for end, brackets in gold.brackets.items(): for start, label_strs in brackets.items(): gold.c.brackets[start][end] = 1 diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index b145df7ac..8f6a662e8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -8,6 +8,7 @@ from ..structs cimport TokenC, Entity from thinc.typedefs cimport weight_t from ..gold cimport GoldParseC from ..gold cimport GoldParse +from ..attrs cimport ENT_TYPE, ENT_IOB from .stateclass cimport StateClass @@ -74,6 +75,16 @@ cdef class BiluoPushDown(TransitionSystem): cdef int preprocess_gold(self, GoldParse gold) except -1: for i in range(gold.length): gold.c.ner[i] = self.lookup_transition(gold.ner[i]) + # Count frequencies, for use in encoder + if gold.c.ner[i].move in (BEGIN, UNIT): + self.freqs[ENT_IOB][3] += 1 + self.freqs[ENT_TYPE][gold.c.ner[i].label] += 1 + elif gold.c.ner[i].move in (IN, LAST): + self.freqs[ENT_IOB][2] += 1 + self.freqs[ENT_TYPE][0] += 1 + elif gold.c.ner[i].move == OUT: + self.freqs[ENT_IOB][1] += 1 + self.freqs[ENT_TYPE][0] += 1 cdef Transition lookup_transition(self, object name) except *: if name == '-': diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 2114df410..387cd0fc9 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -35,6 +35,7 @@ cdef class TransitionSystem: cdef bint* _is_valid cdef readonly int n_moves cdef public int root_label + cdef public freqs cdef int initialize_state(self, StateClass state) except -1 cdef int finalize_state(self, StateClass state) nogil diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 083a4990b..4d32a4e54 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,8 +1,10 @@ from cymem.cymem cimport Pool -from ..structs cimport TokenC from thinc.typedefs cimport weight_t +from collections import defaultdict +from ..structs cimport TokenC from .stateclass cimport StateClass +from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB cdef weight_t MIN_SCORE = -90000 @@ -28,6 +30,9 @@ cdef class TransitionSystem: i += 1 self.c = moves self.root_label = self.strings['ROOT'] + self.freqs = {} + for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB): + self.freqs[attr] = defaultdict(int) cdef int initialize_state(self, StateClass state) except -1: pass diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2bf6cf519..3f84b9561 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -2,6 +2,7 @@ cimport cython from libc.string cimport memcpy, memset import numpy +import struct from ..lexeme cimport EMPTY_LEXEME from ..strings cimport slice_unicode @@ -16,6 +17,7 @@ from ..lexeme cimport get_attr as get_lex_attr from .spans import Span from ..structs cimport UniStr from .token cimport Token +from ..serialize.bits cimport BitArray DEF PADDING = 5 @@ -54,7 +56,7 @@ cdef class Doc: Container class for annotated text. Constructed via English.__call__ or Tokenizer.__call__. """ - def __init__(self, Vocab vocab): + def __init__(self, Vocab vocab, orths_and_spaces=None): self.vocab = vocab size = 20 self.mem = Pool() @@ -71,24 +73,17 @@ cdef class Doc: self.is_tagged = False self.is_parsed = False self._py_tokens = [] - - @classmethod - def from_ids(cls, Vocab vocab, orths, spaces): - cdef int i cdef const LexemeC* lex - cdef Doc self = cls(vocab) - cdef bint space = 0 cdef attr_t orth - for i in range(len(orths)): - orth = orths[i] - lex = self.vocab._by_orth.get(orth) - if lex != NULL: - assert lex.orth == orth - space = spaces[i] - self.push_back(lex, space) - else: - raise Exception('Lexeme not found: %d' % orth) - return self + cdef bint space + if orths_and_spaces is not None: + for orth, space in orths_and_spaces: + lex = self.vocab._by_orth.get(orth) + if lex != NULL: + assert lex.orth == orth + self.push_back(lex, space) + else: + raise Exception('Lexeme not found: %d' % orth) def __getitem__(self, object i): """Get a token. @@ -389,3 +384,26 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] + + def to_bytes(self): + bits = self.vocab.packer.pack(self) + return struct.pack('I', len(bits)) + bits.as_bytes() + + @staticmethod + def from_bytes(Vocab vocab, file_): + keep_reading = True + while keep_reading: + try: + n_bits_str = file_.read(4) + if len(n_bits_str) < 4: + break + n_bits = struct.unpack('I', n_bits_str)[0] + n_bytes = n_bits // 8 + bool(n_bits % 8) + data = file_.read(n_bytes) + except StopIteration: + keep_reading = False + bits = BitArray(data) + array = vocab.packer.unpack(bits) + doc = Doc(vocab, array[:, :2]) + doc.from_array(vocab.packer.attrs, array) + yield doc diff --git a/spacy/util.py b/spacy/util.py index 1d48ab7e9..543479f20 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,6 +2,7 @@ from os import path import codecs import json import re +from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE DATA_DIR = path.join(path.dirname(__file__), '..', 'data') @@ -64,7 +65,17 @@ def read_tokenization(lang): return entries -def read_detoken_rules(lang): +def read_encoding_freqs(data_dir): + tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json'))) + heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json'))) + deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json'))) + iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json'))) + ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json'))) + return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob), + (ENT_TYPE, ne_types)] + + +def read_detoken_rules(lang): # Deprecated? loc = path.join(DATA_DIR, lang, 'detokenize') entries = [] with utf8open(loc) as file_: @@ -73,7 +84,7 @@ def read_detoken_rules(lang): return entries -def align_tokens(ref, indices): +def align_tokens(ref, indices): # Deprecated, surely? start = 0 queue = list(indices) for token in ref: @@ -86,7 +97,7 @@ def align_tokens(ref, indices): assert not queue -def detokenize(token_rules, words): +def detokenize(token_rules, words): # Deprecated? """To align with treebanks, return a list of "chunks", where a chunk is a sequence of tokens that are separated by whitespace in actual strings. Each chunk should be a tuple of token indices, e.g. diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 9bf9c32b0..f36e415ad 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -29,6 +29,7 @@ cdef class Vocab: cpdef readonly StringStore strings cdef readonly object pos_tags cdef readonly int length + cdef public object packer cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b8b4b84a8..4fb1d1645 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,6 +16,8 @@ from .orth cimport word_shape from .typedefs cimport attr_t from cymem.cymem cimport Address +from . import util +from .serialize.packer cimport Packer DEF MAX_VEC_SIZE = 100000 @@ -53,6 +55,8 @@ cdef class Vocab: if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) + self.packer = Packer(self, util.read_encoding_freqs(data_dir)) + def __len__(self): """The current number of lexemes stored.""" return self.length diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py index b1d1f99f3..40d56669f 100644 --- a/tests/serialize/test_codecs.py +++ b/tests/serialize/test_codecs.py @@ -5,7 +5,6 @@ import numpy from spacy.vocab import Vocab from spacy.serialize.packer import _BinaryCodec -from spacy.serialize.packer import make_vocab_codec from spacy.serialize.packer import _AttributeCodec from spacy.serialize.bits import BitArray