* Tmp

2025-07-15 18:52:29 +03:00 · 2015-03-09 01:46:22 -04:00 · 2015-03-09 01:46:22 -04:00 · b3eda03c9c
commit b3eda03c9c
parent 220ce8bfed
9 changed files with 336 additions and 240 deletions
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -42,9 +42,17 @@ cdef struct PosTag:
    univ_pos_t pos
 cdef struct Entity:
    int start
    int end
    int tag
    int label
 cdef struct TokenC:
    const LexemeC* lex
    Morphology morph
    Entity ent
    univ_pos_t pos
    int tag
    int idx
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@ -2,15 +2,17 @@ from libc.stdint cimport uint32_t
 from cymem.cymem cimport Pool
-from ..structs cimport TokenC
+from ..structs cimport TokenC, Entity
 cdef struct State:
    TokenC* sent
    int* stack
    Entity* ent
    int i
    int sent_len
    int stack_len
    int ents_len
 cdef int add_dep(const State *s, const int head, const int child, const int label) except -1
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -35,16 +35,16 @@ cdef get_cost_func_t[N_MOVES] get_cost_funcs
 cdef class ArcEager(TransitionSystem):
    @classmethod
    def get_labels(cls, gold_parses):
-        labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {},
+        move_labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {},
-                  LEFT: {}, BREAK: {'ROOT': True}}
+                       LEFT: {}, BREAK: {'ROOT': True}}
-        for parse in gold_parses:
+        for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
-            for i, (head, label) in enumerate(zip(parse.heads, parse.labels)):
+            for i, (head, label) in enumerate(zip(heads, labels)):
                if label != 'ROOT':
                    if head > i:
-                        labels[RIGHT][label] = True
+                        move_labels[RIGHT][label] = True
                    elif head < i:
-                        labels[LEFT][label] = True
+                        move_labels[LEFT][label] = True
-        return labels
+        return move_labels
    cdef Transition init_transition(self, int clas, int move, int label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
--- a/spacy/syntax/conll.pxd
+++ b/spacy/syntax/conll.pxd
@ -1,22 +1,33 @@
 from cymem.cymem cimport Pool
 from ..structs cimport TokenC
 from .transition_system cimport Transition
 cdef class GoldParse:
    cdef Pool mem
    cdef int length
    cdef readonly int loss
    cdef readonly object ids
    cdef readonly object tags
    cdef readonly object heads
    cdef readonly object labels
    cdef readonly object tags_
    cdef readonly object labels_
    cdef readonly object ner_
    cdef Transition* ner
    cdef int* c_heads
    cdef int* c_labels
    cdef int length
    cdef int loss
    cdef readonly unicode raw_text
    cdef readonly list words
    cdef readonly list ids
    cdef readonly list tags
    cdef readonly list heads
    cdef readonly list labels
    cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
 cdef class NERAnnotation:
    cdef Pool mem
    cdef int* starts
    cdef int* ends
    cdef int* labels
    cdef readonly list entities
--- a/spacy/syntax/conll.pyx
+++ b/spacy/syntax/conll.pyx
@ -1,67 +1,24 @@
-cdef class GoldParse:
+import numpy
-    def __init__(self, raw_text, words, ids, tags, heads, labels):
+import codecs
-        self.mem = Pool()
+from .ner_util import iob_to_biluo
        self.loss = 0
        self.length = len(words)
        self.raw_text = raw_text
        self.words = words
        self.ids = ids
        self.tags = tags
        self.heads = heads
        self.labels = labels
        self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int))
        self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
-    @property
+from libc.string cimport memset
    def n_non_punct(self):
        return len([l for l in self.labels if l != 'P'])
    @property
    def py_heads(self):
        return [self.c_heads[i] for i in range(self.length)]
-    cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
+def read_docparse_file(loc):
-        n = 0
+    sents = []
-        for i in range(self.length):
+    for sent_str in codecs.open(loc, 'r', 'utf8').read().strip().split('\n\n'):
            if not score_punct and self.labels[i] == 'P':
                continue
            n += (i + tokens[i].head) == self.c_heads[i]
        return n
    def is_correct(self, i, head):
        return head == self.c_heads[i]
    @classmethod
    def from_conll(cls, unicode sent_str):
        ids = []
        words = []
        heads = []
        labels = []
        tags = []
        for i, line in enumerate(sent_str.split('\n')):
            id_, word, pos_string, head_idx, label = _parse_line(line)
            words.append(word)
            if head_idx == -1:
                head_idx = i
            ids.append(id_)
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
        text = ' '.join(words)
        return cls(text, [words], ids, words, tags, heads, labels)
    @classmethod
    def from_docparse(cls, unicode sent_str):
        words = []
        heads = []
        labels = []
        tags = []
        ids = []
        iob_ents = []
        lines = sent_str.strip().split('\n')
        raw_text = lines.pop(0).strip()
        tok_text = lines.pop(0).strip()
        for i, line in enumerate(lines):
-            id_, word, pos_string, head_idx, label = _parse_line(line)
+            id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line)
            if label == 'root':
                label = 'ROOT'
            words.append(word)
@ -71,57 +28,78 @@ cdef class GoldParse:
            heads.append(head_idx)
            labels.append(label)
            tags.append(pos_string)
-        tokenized = [sent_str.replace('<SEP>', ' ').split(' ')
+            iob_ents.append(iob_ent)
-                     for sent_str in tok_text.split('<SENT>')]
+        tokenized = [s.replace('<SEP>', ' ').split(' ')
-        return cls(raw_text, words, ids, tags, heads, labels)
+                     for s in tok_text.split('<SENT>')]
        sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
    return sents
-    def align_to_tokens(self, tokens, label_ids):
+
-        orig_words = list(self.words)
+cdef class GoldParse:
-        annot = zip(self.ids, self.tags, self.heads, self.labels)
+    def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types):
-        self.ids = []
+        self.mem = Pool()
-        self.tags = []
+        self.loss = 0
        self.heads = []
        self.labels = []
        missed = []
        for token in tokens:
            while annot and token.idx > annot[0][0]:
                miss_id, miss_tag, miss_head, miss_label = annot.pop(0)
                if not is_punct_label(miss_label):
                    self.loss += 1
            if not annot:
                self.tags.append(None)
                self.heads.append(None)
                self.labels.append(None)
                continue
            id_, tag, head, label = annot[0]
            if token.idx == id_:
                self.tags.append(tag)
                self.heads.append(head)
                self.labels.append(label)
                annot.pop(0)
            elif token.idx < id_:
                self.tags.append(None)
                self.heads.append(None)
                self.labels.append(None)
            else:
                raise StandardError
        self.length = len(tokens)
-        self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int))
+        self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
-        self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
+        self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
-        self.ids = [token.idx for token in tokens]
+        self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
-        self.map_heads(label_ids)
+        self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32)
        return self.loss
-    def map_heads(self, label_ids):
+        self.ids[:] = -1
-        mapped_heads = _map_indices_to_tokens(self.ids, self.heads)
+        self.tags[:] = -1
-        for i in range(self.length):
+        self.heads[:] = -1
-            if mapped_heads[i] is None:
+        self.labels[:] = -1
        self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))
        self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int))
        self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int))
        for i in range(len(tokens)):
            self.c_heads[i] = -1
            self.c_labels[i] = -1
        self.tags_ = [None] * len(tokens)
        self.labels_ = [None] * len(tokens)
        self.ner_ = [None] * len(tokens)
        idx_map = {token.idx: token.i for token in tokens}
        print idx_map
        # TODO: Fill NER moves
        print raw_text
        for idx, tag, head, label, ner in zip(*annot_tuples):
            if idx < tokens[0].idx:
                pass
            elif idx > tokens[-1].idx:
                break
            elif idx in idx_map:
                i = idx_map[idx]
                print i, idx, head, idx_map.get(head, -1)
                self.ids[i] = idx
                self.tags[i] = pos_tags.index(tag)
                self.heads[i] = idx_map.get(head, -1)
                self.labels[i] = dep_labels[label]
                self.c_heads[i] = -1
                self.c_labels[i] = -1
-            else:
+                self.tags_[i] = tag
-                self.c_heads[i] = mapped_heads[i]
+                self.labels_[i] = label
-                self.c_labels[i] = label_ids[self.labels[i]]
+                self.ner_[i] = ner
-        return self.loss
+
    @property
    def n_non_punct(self):
        return len([l for l in self.labels if l != 'P'])
    cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
        n = 0
        for i in range(self.length):
            if not score_punct and self.labels_[i] == 'P':
                continue
            if self.heads[i] == -1:
                continue
            n += (i + tokens[i].head) == self.heads[i]
        return n
    def is_correct(self, i, head):
        return head == self.c_heads[i]
 def is_punct_label(label):
@ -146,6 +124,63 @@ def _parse_line(line):
        id_ = int(pieces[0])
        word = pieces[1]
        pos = pieces[3]
        iob_ent = pieces[5]
        head_idx = int(pieces[6])
        label = pieces[7]
-        return id_, word, pos, head_idx, label
+        return id_, word, pos, head_idx, label, iob_ent
 cdef class NERAnnotation:
    def __init__(self, entities, length, entity_types):
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)
        cdef int start, end, label
        for start, end, label in entities:
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label
    @property
    def biluo_tags(self):
        pass
    @property
    def iob_tags(self):
        pass
    @classmethod
    def from_iobs(cls, iob_strs, entity_types):
        return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)
    @classmethod
    def from_biluos(cls, tag_strs, entity_types):
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            move, label_str = tag_str.split('-')
            label = entity_types.index(label_str)
            if label == -1:
                label = len(entity_types)
                entity_types.append(label)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
--- a/spacy/syntax/ner.pxd
+++ b/spacy/syntax/ner.pxd
@ -1,28 +1,7 @@
-from cymem.cymem cimport Pool
+from .transition_system cimport TransitionSystem
-
+from .transition_system cimport Transition
-from thinc.typedefs cimport weight_t
+from ._state cimport State
-from ._state cimport State 
+cdef class BiluoPushDown(TransitionSystem):
-
+    pass
 cdef struct Transition:
    int clas
    int move
    int label
    int cost
    weight_t score
 cdef class TransitionSystem:
    cdef Pool mem
    cdef readonly int n_moves
    cdef dict label_ids
    cdef const Transition* _moves
    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *
    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
                              const State* s,
                              const int* gold_heads, const int* gold_labels) except *
    cdef int transition(self, State *s, const Transition* t) except -1
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -1,16 +1,15 @@
 from __future__ import unicode_literals
 from ._state cimport State
 from ._state cimport has_head, get_idx, get_s0, get_n0
 from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
 from ._state cimport head_in_buffer, children_in_buffer
 from ._state cimport head_in_stack, children_in_stack
-from ..structs cimport TokenC
+from .transition_system cimport Transition
 from .transition_system cimport do_func_t
 from ..structs cimport TokenC, Entity
-DEF NON_MONOTONIC = True
+from thinc.typedefs cimport weight_t
-DEF USE_BREAK = True
+from .conll cimport GoldParse
 from .ner_util import iob_to_biluo
 cdef enum:
@ -23,13 +22,34 @@ cdef enum:
    N_MOVES
-cdef int is_valid(ActionType act, int label, State* s) except -1:
+cdef do_func_t[N_MOVES] do_funcs
 cdef bint entity_is_open(const State *s) except -1:
    return s.sent[s.i - 1].ent.tag >= 1
 cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
    if not entity_is_open(s):
        return False
    cdef const Entity* curr = &s.sent[s.i - 1].ent
    cdef const Transition* gold = &golds[(s.i - 1) + curr.start]
    if gold.move != BEGIN and gold.move != UNIT:
        return True
    elif gold.label != s.ent.label:
        return True
    else:
        return False
 cdef int _is_valid(int act, int label, const State* s) except -1:
    if act == BEGIN:
        return not entity_is_open(s)
    elif act == IN:
-        return entity_is_open(s) and s.curr.label == label
+        return entity_is_open(s) and s.ent.label == label
    elif act == LAST:
-        return entity_is_open(s) and s.curr.label == label
+        return entity_is_open(s) and s.ent.label == label
    elif act == UNIT:
        return not entity_is_open(s)
    elif act == OUT:
@ -38,8 +58,56 @@ cdef int is_valid(ActionType act, int label, State* s) except -1:
        raise UnknownMove(act, label)
-cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag,
+cdef class BiluoPushDown(TransitionSystem):
-                 ActionType next_act, bint is_sunk):
+    @classmethod
    def get_labels(cls, gold_tuples):
        move_labels = {BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'ROOT': True}}
        moves = ('-', 'B', 'I', 'L', 'U')
        for (raw_text, toks, (ids, tags, heads, labels, iob)) in gold_tuples:
            for i, ner_tag in enumerate(iob_to_biluo(iob)):
                if ner_tag != 'O' and ner_tag != '-':
                    move_str, label = ner_tag.split('-')
                    move_labels[moves.index(move_str)][label] = True
        return move_labels
    cdef Transition init_transition(self, int clas, int move, int label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
        t.score = 0
        t.clas = clas
        t.move = move
        t.label = label
        t.do = do_funcs[move]
        t.get_cost = _get_cost
        return t
    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
        cdef int best = -1
        cdef weight_t score = -90000
        cdef const Transition* m
        cdef int i
        for i in range(self.n_moves):
            m = &self.c[i]
            if _is_valid(m.move, m.label, s) and scores[i] > score:
                best = i
                score = scores[i]
        assert best >= 0
        cdef Transition t = self.c[best]
        t.score = score
        return t
 cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1:
    if not _is_valid(self.move, self.label, s):
        return 9000
    cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
    cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT
    return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label,
                        next_act, is_sunk)
 cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
                   int next_act, bint is_sunk):
    if g_act == MISSING:
        return True
    if act == BEGIN:
@ -112,98 +180,46 @@ cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag,
            return False
-cdef bint entity_is_open(State *s) except -1:
+cdef int _do_begin(const Transition* self, State* s) except -1:
-    return s.sent[s.i - 1].ent.tag >= 1
+    s.ent += 1
    s.ents_len += 1
    s.ent.start = s.i
    s.ent.label = self.label
    s.sent[s.i].ent.tag = self.clas 
    s.i += 1
-cdef bint entity_is_sunk(State *s, Move* golds) except -1:
+cdef int _do_in(const Transition* self, State* s) except -1:
-    if not entity_is_open(s):
+    s.sent[s.i].ent.tag = self.clas 
-        return False
+    s.i += 1
    cdef const Entity* curr = &s.sent[s.i - 1].ent
    cdef Move* gold = &golds[(s.i - 1) + curr.start]
    if gold.action != BEGIN and gold.action != UNIT:
        return True
    elif gold.label != s.curr.label:
        return True
    else:
        return False
-cdef class TransitionSystem:
+cdef int _do_last(const Transition* self, State* s) except -1:
-    def __init__(self, list entity_type_strs):
+    s.ent.end = s.i+1
-        self.mem = Pool()
+    s.sent[s.i].ent.tag = self.clas 
    s.i += 1
        cdef Move* m
        label_names = {'-': 0}
        for i, tag_name in enumerate(tag_names):
            m = &moves[i]
            if '-' in tag_name:
                action_str, label = tag_name.split('-')
            elif tag_name == 'O':
                action_str = 'O'
                label = '-'
            elif tag_name == 'NULL' or tag_name == 'EOL':
                action_str = '?'
                label = '-'
            else:
                raise StandardError(tag_name)
            m.action = ACTION_NAMES.index(action_str)
            m.label = label_names.setdefault(label, len(label_names))
            m.clas = i
-    cdef int transition(self, State *s, Move* move) except -1:
+cdef int _do_unit(const Transition* self, State* s) except -1:
-        if move.action == BEGIN:
+    s.ent += 1
-            s.curr.start = s.i
+    s.ents_len += 1
-            s.curr.label = label
+    s.ent.start = s.i
-        elif move.action == IN:
+    s.ent.label = self.label
-            pass
+    s.ent.end = s.i+1
-        elif move.action == LAST:
+    s.sent[s.i].ent.tag = self.clas 
-            s.curr.end = s.i
+    s.i += 1
            s.ents[s.j] = s.curr
            s.j += 1
            s.curr.start = 0
            s.curr.label = -1
            s.curr.end = 0
        elif move.action == UNIT:
            begin_entity(s, move.label)
            end_entity(s)
        elif move.action == OUT:
            pass
        s.tags[s.i] = move.clas 
        s.i += 1
    cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
        cdef int best = -1
        cdef weight_t score = -90000
        cdef const Transition* m
        cdef int i
        for i in range(self.n_moves):
            m = &self._moves[i]
            if _is_valid(s, m.ent_move, m.ent_label) and scores[i] > score:
                best = i
                score = scores[i]
        assert best >= 0
        cdef Transition t = self._moves[best]
        t.score = score
        return t
-    cdef Transition best_gold(self, Transition* guess, const weight_t* scores,
+cdef int _do_out(const Transition* self, State* s) except -1:
-                              const State* s, Move* golds) except *:
+    s.sent[s.i].ent.tag = self.clas 
-        cdef Move* g = &golds[s.i]
+    s.i += 1
-        cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT
+
-        cdef bint is_sunk = entity_is_sunk(s, golds)
+
-        cdef Move* m
+do_funcs[BEGIN] = _do_begin
-        cdef int n_accept = 0
+do_funcs[IN] = _do_in
-        for i in range(1, self.n_classes):
+do_funcs[LAST] = _do_last
-            m = &moves[i]
+do_funcs[UNIT] = _do_unit
-            if _is_valid(s, m.move, m.label) and \
+do_funcs[OUT] = _do_out
               _is_gold(s, m.move, m.label, next_act, is_sunk) and \
               scores[i] > score:
                best = i
                score = scores[i]
        assert best >= 0
        return self._moves[best]
 class OracleError(Exception):
@ -212,3 +228,5 @@ class OracleError(Exception):
 class UnknownMove(Exception):
    pass
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -35,3 +35,10 @@ cdef class TransitionSystem:
    cdef Transition best_gold(self, const weight_t* scores, const State* state,
                              GoldParse gold) except *
 #cdef class PyState:
 #    """Provide a Python class for testing purposes."""
 #    cdef Pool mem
 #    cdef TransitionSystem system
 #    cdef State* _state
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -45,3 +45,39 @@ cdef class TransitionSystem:
                score = scores[i]
        assert score > MIN_SCORE
        return best
 #cdef class PyState:
 #    """Provide a Python class for testing purposes."""
 #    def __init__(self, GoldParse gold):
 #        self.mem = Pool()
 #        self.system = EntityRecognition(labels)
 #        self._state = init_state(self.mem, tokens, gold.length)
 #
 #    def transition(self, name):
 #        cdef const Transition* trans = self._transition_by_name(name)
 #        trans.do(trans, self._state)
 #
 #    def is_valid(self, name):
 #        cdef const Transition* trans = self._transition_by_name(name)
 #        return _is_valid(trans.move, trans.label, self._state)
 #
 #    def is_gold(self, name):
 #        cdef const Transition* trans = self._transition_by_name(name)
 #        return _get_const(trans, self._state, self._gold)
 #
 #    property ent:
 #        def __get__(self):
 #            pass
 #
 #    property n_ents:
 #        def __get__(self):
 #            pass
 #
 #    property i:
 #        def __get__(self):
 #            pass
 #
 #    property open_entity:
 #        def __get__(self):
 #            return entity_is_open(self._s)