spaCy/spacy/gold.pyx

from __future__ import unicode_literals, print_function

import numpy
import io
import json
import random
import re
import os
from os import path

from libc.string cimport memset

import ujson as json

from .syntax import nonproj


def tags_to_entities(tags):
    '''Collect entity spans from a sequence of BILUO-style tag strings.

    Tags are dispatched on their leading character: 'U' emits a one-token
    span, 'B' remembers where a span opens and 'L' emits the span that was
    opened. Tags starting with 'O' forget any open span, the tag '-' is
    skipped, and tags starting with 'I' only check that a span is open.

    Arguments:
        tags (sequence):
            Tag strings such as 'B-PER', 'I-PER', 'L-PER', 'U-LOC', 'O', '-'.

    Returns:
        spans (list):
            (label, start, end) tuples, where label is tag[2:] and start/end
            are inclusive positions in tags. start is None when an 'L' tag
            is seen without an open span.

    Raises:
        AssertionError: an 'I' tag occurs while no span is open.
        Exception: a tag matches none of the recognised forms.
    '''
    spans = []
    open_at = None
    for idx, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            open_at = None
        elif tag == '-':
            pass
        elif tag.startswith('I'):
            assert open_at is not None, tags[:idx]
        elif tag.startswith('U'):
            spans.append((tag[2:], idx, idx))
        elif tag.startswith('B'):
            open_at = idx
        elif tag.startswith('L'):
            spans.append((tag[2:], open_at, idx))
            open_at = None
        else:
            raise Exception(tag)
    return spans


def merge_sents(sents):
    '''Concatenate several annotated sentences into a single one.

    Arguments:
        sents (iterable):
            ((ids, words, tags, heads, labels, ner), brackets) pairs. Each
            bracket is a mapping with the keys 'first', 'last' and 'label'.

    Returns:
        merged (list):
            A one-element list [(annots, brackets)]. annots is a list of six
            lists [ids, words, tags, heads, labels, ner]; brackets is a list
            of (first, last, label) tuples. ids, heads and bracket positions
            are shifted by the number of tokens in the preceding sentences.
    '''
    merged_ids, merged_words, merged_tags = [], [], []
    merged_heads, merged_labels, merged_ner = [], [], []
    merged_brackets = []
    offset = 0
    for annots, brackets in sents:
        ids, words, tags, heads, labels, ner = annots
        merged_ids.extend(tok_id + offset for tok_id in ids)
        merged_words.extend(words)
        merged_tags.extend(tags)
        merged_heads.extend(head + offset for head in heads)
        merged_labels.extend(labels)
        merged_ner.extend(ner)
        for bracket in brackets:
            merged_brackets.append((bracket['first'] + offset,
                                    bracket['last'] + offset,
                                    bracket['label']))
        # Everything in the next sentence is shifted past this one.
        offset += len(ids)
    merged_annots = [merged_ids, merged_words, merged_tags,
                     merged_heads, merged_labels, merged_ner]
    return [(merged_annots, merged_brackets)]


def align(cand_words, gold_words):
    '''Map each candidate word to the position of its matching gold word.

    The edit path from _min_edit_path is replayed: 'M' (match) records the
    current gold position, 'S' (substitution) and 'D' (deletion) record None,
    and 'I' (insertion) records nothing. 'M', 'S' and 'I' each advance the
    gold position by one.

    Arguments:
        cand_words (list): The words to be aligned.
        gold_words (list): The words to align against.

    Returns:
        mapping (list):
            One entry per 'M', 'S' or 'D' move: an index into gold_words for
            a match, otherwise None.

    Raises:
        Exception: the edit path contains an unknown move.
    '''
    _, moves = _min_edit_path(cand_words, gold_words)
    mapping = []
    gold_pos = 0
    for op in moves:
        if op not in ('M', 'S', 'D', 'I'):
            raise Exception(op)
        if op != 'I':
            # Every move except an insertion accounts for one candidate word.
            mapping.append(gold_pos if op == 'M' else None)
        if op != 'D':
            # Every move except a deletion consumes one gold word.
            gold_pos += 1
    return mapping


# Matches every non-word character; these are stripped before words are compared.
punct_re = re.compile(r'\W')
def _min_edit_path(cand_words, gold_words):
    '''Compute a minimum-cost edit path that turns cand_words into gold_words.

    Non-word characters (punct_re) are removed from every word first. Costs
    compare words case-insensitively, but a step is only recorded as a match
    ('M') when the stripped words are exactly equal; otherwise it is recorded
    as a substitution ('S'). Deleting a candidate word that is empty after
    stripping, and does not match the gold word, is free.

    Arguments:
        cand_words (list): Candidate word strings.
        gold_words (list): Gold word strings.

    Returns:
        (cost, path) (tuple):
            cost is the integer cost of the path; path is a string over the
            moves 'M' (match), 'S' (substitute), 'I' (insert a gold word) and
            'D' (delete a candidate word).
    '''
    cdef:
        Pool mem
        int i, j, n_gold
        int* curr_costs
        int* prev_costs

    # TODO: Fix this --- just do it properly, make the full edit matrix and
    # then walk back over it...
    # Preprocess inputs
    cand_words = [punct_re.sub('', w) for w in cand_words]
    gold_words = [punct_re.sub('', w) for w in gold_words]

    if cand_words == gold_words:
        # Identical after stripping: every word is a match.
        return 0, 'M' * len(gold_words)
    mem = Pool()
    n_gold = len(gold_words)
    # Levenshtein distance, except we need the history, and we may want different
    # costs.
    # Each cell keeps its history as a string of moves next to its cost.
    previous_row = []
    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    # First row: reaching gold position i from no candidate words takes i insertions.
    for i in range(n_gold + 1):
        previous_row.append('I' * i)
        prev_costs[i] = i
    for i, cand in enumerate(cand_words):
        # First column: matching no gold words means deleting every candidate so far.
        current_row = ['D' * (i + 1)]
        curr_costs[0] = i+1
        for j, gold in enumerate(gold_words):
            if gold.lower() == cand.lower():
                s_cost = prev_costs[j]
                i_cost = curr_costs[j] + 1
                d_cost = prev_costs[j + 1] + 1
            else:
                s_cost = prev_costs[j] + 1
                i_cost = curr_costs[j] + 1
                # A candidate that is empty after stripping is deleted for free.
                d_cost = prev_costs[j + 1] + (1 if cand else 0)

            # Ties are broken in the order substitute/match, insert, delete.
            if s_cost <= i_cost and s_cost <= d_cost:
                best_cost = s_cost
                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
            elif i_cost <= s_cost and i_cost <= d_cost:
                best_cost = i_cost
                best_hist = current_row[j] + 'I'
            else:
                best_cost = d_cost
                best_hist = previous_row[j + 1] + 'D'

            current_row.append(best_hist)
            curr_costs[j+1] = best_cost
        previous_row = current_row
        # Roll the cost rows: the current row becomes the previous one.
        for j in range(n_gold + 1):
            prev_costs[j] = curr_costs[j]
            curr_costs[j] = 0

    return prev_costs[n_gold], previous_row[-1]


def read_json_file(loc, docs_filter=None):
    '''Read training paragraphs from a JSON file, or from every entry of a
    directory.

    Arguments:
        loc (unicode):
            Path to a JSON file or to a directory. Each entry of a directory
            is read recursively.

        docs_filter (callable):
            Optional predicate that receives each document dict. Documents
            for which it returns a false value are skipped. The filter is
            applied to files found inside directories as well.

    Yields:
        (raw, sents) (tuple):
            One pair per paragraph that has at least one sentence. raw is
            the paragraph's 'raw' value, or None if absent. sents is a list
            of ((ids, words, tags, heads, labels, ner), brackets) pairs, one
            per sentence. Missing token fields default to '-' for 'tag' and
            'ner', '' for 'dep' and 0 for 'head'.
    '''
    if path.isdir(loc):
        for filename in os.listdir(loc):
            # Pass the filter down, otherwise it is ignored for directories.
            yield from read_json_file(path.join(loc, filename),
                                      docs_filter=docs_filter)
    else:
        with io.open(loc, 'r', encoding='utf8') as file_:
            docs = json.load(file_)
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue
            for paragraph in doc['paragraphs']:
                sents = []
                for sent in paragraph['sentences']:
                    words = []
                    ids = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for i, token in enumerate(sent['tokens']):
                        words.append(token['orth'])
                        ids.append(i)
                        tags.append(token.get('tag', '-'))
                        # The stored head is added to the token's own index.
                        heads.append(token.get('head', 0) + i)
                        labels.append(token.get('dep', ''))
                        # Ensure ROOT label is case-insensitive
                        if labels[-1].lower() == 'root':
                            labels[-1] = 'ROOT'
                        ner.append(token.get('ner', '-'))
                    sents.append((
                        (ids, words, tags, heads, labels, ner),
                        sent.get('brackets', [])))
                if sents:
                    yield (paragraph.get('raw', None), sents)

def _iob_to_biluo(tags):
    '''Convert a sequence of IOB tags into BILUO tags.

    Works on a copy of tags, alternately draining a run of 'O' tags with
    _consume_os and one entity with _consume_ent until nothing is left.

    Arguments:
        tags (iterable): IOB tag strings.

    Returns:
        biluo (list): The converted tags.
    '''
    remaining = list(tags)
    biluo = []
    while remaining:
        biluo += _consume_os(remaining)
        biluo += _consume_ent(remaining)
    return biluo


def _consume_os(tags):
    while tags and tags[0] == 'O':
        yield tags.pop(0)


def _consume_ent(tags):
    if not tags:
        return []
    target = tags.pop(0).replace('B', 'I')
    length = 1
    while tags and tags[0] == target:
        length += 1
        tags.pop(0)
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]


cdef class GoldParse:
    '''Gold-standard annotations for a document, aligned to the document's
    own tokenization.
    '''
    @classmethod
    def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
        '''Build a GoldParse from an (ids, words, tags, heads, deps, entities)
        tuple. The ids are ignored.
        '''
        _, words, tags, heads, deps, entities = annot_tuples
        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
                   make_projective=make_projective)

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
        '''Align the gold annotations to the tokens of doc.

        Arguments:
            doc (Doc):
                The document the annotations refer to.
            annot_tuples:
                Not read by this constructor; the name is rebound below.
            words (list):
                Gold words. Defaults to the token texts of doc.
            tags (list):
                Gold tags, one per gold word. Defaults to None for each token.
            heads (list):
                Gold head indices into words. Defaults to each token's own
                index.
            deps (list):
                Gold dependency labels. Defaults to None for each token.
            entities (list):
                Gold entity annotations. None gives None for each token and
                an empty sequence gives 'O' for each token. If the first item
                is not a string, the items are treated as character offsets
                and converted with biluo_tags_from_offsets.
            make_projective (bool):
                If True, replace the heads with the ones returned by
                nonproj.PseudoProjectivity.projectivize.

        Raises:
            Exception: nonproj.contains_cycle finds a cycle in the aligned heads.
        '''
        if words is None:
            words = [token.text for token in doc]
        if tags is None:
            tags = [None for _ in doc]
        if heads is None:
            heads = [token.i for token in doc]
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
            entities = [None for _ in doc]
        elif len(entities) == 0:
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):
            # Assume we have entities specified by character offset.
            entities = biluo_tags_from_offsets(doc, entities)

        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        # These are filled by the tagger/parser/entity recogniser
        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
        self.labels = [''] * len(doc)
        self.ner = ['-'] * len(doc)

        # cand_to_gold[i] is the gold index of doc token i, and gold_to_cand[j]
        # is the doc index of gold word j. Either is None where unaligned.
        doc_words = [t.orth_ for t in doc]
        self.cand_to_gold = align(doc_words, words)
        self.gold_to_cand = align(words, doc_words)

        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
        self.orig_annot = list(zip(*annot_tuples))

        for i, gold_i in enumerate(self.cand_to_gold):
            if doc[i].text.isspace():
                # Fixed annotations for whitespace tokens. An aligned gold
                # word still overrides them below.
                self.tags[i] = 'SP'
                self.heads[i] = None
                self.labels[i] = None
                self.ner[i] = 'O'
            if gold_i is not None:
                self.tags[i] = tags[gold_i]
                # Translate the gold head index into an index into doc.
                self.heads[i] = self.gold_to_cand[heads[gold_i]]
                self.labels[i] = deps[gold_i]
                self.ner[i] = entities[gold_i]

        cycle = nonproj.contains_cycle(self.heads)
        if cycle is not None:
            raise Exception("Cycle found: %s" % cycle)

        if make_projective:
            proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)
            self.heads = proj_heads

    def __len__(self):
        '''Return the number of tokens in the document given at construction.'''
        return self.length

    @property
    def is_projective(self):
        '''True unless nonproj.is_nonproj_tree reports the heads as
        non-projective.
        '''
        return not nonproj.is_nonproj_tree(self.heads)


def biluo_tags_from_offsets(doc, entities):
    '''Turn character-offset entity spans into one tag per token, using the
    Begin/In/Last/Unit/Out (biluo) scheme.

    Arguments:
        doc (Doc):
            The document the offsets refer to. Tags are produced for its
            tokens, using each token's idx (start character), i (position)
            and length.

        entities (sequence):
            (start, end, label) triples, where start and end are character
            offsets that slice the original string.

    Returns:
        tags (list):
            One unicode string per token: "O", "-" or "{action}-{label}" with
            action one of "B", "I", "L", "U". "B" marks the first token of a
            multi-token entity, "I" the tokens strictly inside it, "L" its
            last token and "U" a single-token entity. "O" is given to tokens
            that share no character with any entity span. "-" remains on
            tokens that overlap an entity span whose offsets do not line up
            with token boundaries; the training algorithm will view these as
            missing values.

    Example:
        text = 'I like London.'
        entities = [(len('I like '), len('I like London'), 'LOC')]
        doc = nlp.tokenizer(text)

        tags = biluo_tags_from_offsets(doc, entities)

        assert tags == ['O', 'O', 'U-LOC', 'O']
    '''
    token_at_start = {token.idx: token.i for token in doc}
    token_at_end = {token.idx+len(token): token.i for token in doc}
    tags = ['-' for _ in doc]
    # Tag the entities whose boundaries coincide with token boundaries.
    for start_char, end_char, label in entities:
        first = token_at_start.get(start_char)
        last = token_at_end.get(end_char)
        if first is None or last is None:
            # Offsets fall inside a token: leave the tags as missing.
            continue
        if first == last:
            tags[first] = 'U-%s' % label
            continue
        tags[first] = 'B-%s' % label
        for inner in range(first+1, last):
            tags[inner] = 'I-%s' % label
        tags[last] = 'L-%s' % label
    # Tokens that touch no entity character are definite non-entities.
    covered = set()
    for start_char, end_char, label in entities:
        covered.update(range(start_char, end_char))
    for token in doc:
        token_chars = range(token.idx, token.idx+len(token))
        if not any(pos in covered for pos in token_chars):
            tags[token.i] = 'O'
    return tags


def is_punct_label(label):
    '''Return True if label is exactly 'P', or equals 'punct' ignoring case.'''
    if label == 'P':
        return True
    return label.lower() == 'punct'
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`from __future__ import unicode_literals, print_function`

* Tmp 2015-03-09 08:46:22 +03:00			`import numpy`
caught more codecs.open -> io.open 2015-09-30 21:20:09 +03:00			`import io`
* Add read_json_file to conll.pyx 2015-05-06 17:27:31 +03:00			`import json`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`import random`
* Add functions for Levenshtein distance alignment 2015-05-24 22:50:48 +03:00			`import re`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 04:52:55 +03:00			`import os`
			`from os import path`
* Hacks to conll.pyx. Should clean these up. 2015-03-08 08:14:48 +03:00
* Tmp 2015-03-09 08:46:22 +03:00			`from libc.string cimport memset`
* Hacks to conll.pyx. Should clean these up. 2015-03-08 08:14:48 +03:00
Fix json loading, for Python 3. 2016-10-20 22:23:26 +03:00			`import ujson as json`
* Allow json to be used as a fallback if ujson is not available 2015-07-25 19:11:36 +03:00
integrated pseudo-projective parsing into parser - nonproj.pyx holds a class PseudoProjectivity which currently holds all functionality to implement Nivre & Nilsson 2005's pseudo-projective parsing using the HEAD decoration scheme - changed lefts/rights in Token to account for possible non-projective structures 2016-03-01 12:09:08 +03:00			`from .syntax import nonproj`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00
* Add file to hold GoldParse class 2015-02-21 19:06:58 +03:00
* Avoid shipping the spacy.munge package 2015-06-08 01:54:13 +03:00			`def tags_to_entities(tags):`
			`entities = []`
			`start = None`
			`for i, tag in enumerate(tags):`
			`if tag.startswith('O'):`
			`# TODO: We shouldn't be getting these malformed inputs. Fix this.`
			`if start is not None:`
			`start = None`
			`continue`
			`elif tag == '-':`
			`continue`
			`elif tag.startswith('I'):`
			`assert start is not None, tags[:i]`
			`continue`
			`if tag.startswith('U'):`
			`entities.append((tag[2:], i, i))`
			`elif tag.startswith('B'):`
			`start = i`
			`elif tag.startswith('L'):`
			`entities.append((tag[2:], start, i))`
			`start = None`
			`else:`
			`raise Exception(tag)`
			`return entities`


Move merge_sents method into spacy.gold 2016-10-13 04:24:29 +03:00			`def merge_sents(sents):`
			`m_deps = [[], [], [], [], [], []]`
			`m_brackets = []`
			`i = 0`
			`for (ids, words, tags, heads, labels, ner), brackets in sents:`
			`m_deps[0].extend(id_ + i for id_ in ids)`
			`m_deps[1].extend(words)`
			`m_deps[2].extend(tags)`
			`m_deps[3].extend(head + i for head in heads)`
			`m_deps[4].extend(labels)`
			`m_deps[5].extend(ner)`
			`m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)`
			`i += len(ids)`
			`return [(m_deps, m_brackets)]`

* Avoid shipping the spacy.munge package 2015-06-08 01:54:13 +03:00
* Add functions for Levenshtein distance alignment 2015-05-24 22:50:48 +03:00			`def align(cand_words, gold_words):`
			`cost, edit_path = _min_edit_path(cand_words, gold_words)`
			`alignment = []`
			`i_of_gold = 0`
			`for move in edit_path:`
			`if move == 'M':`
			`alignment.append(i_of_gold)`
			`i_of_gold += 1`
			`elif move == 'S':`
			`alignment.append(None)`
			`i_of_gold += 1`
			`elif move == 'D':`
			`alignment.append(None)`
			`elif move == 'I':`
			`i_of_gold += 1`
			`else:`
			`raise Exception(move)`
			`return alignment`


			`punct_re = re.compile(r'\W')`
			`def _min_edit_path(cand_words, gold_words):`
			`cdef:`
			`Pool mem`
			`int i, j, n_cand, n_gold`
			`int* curr_costs`
			`int* prev_costs`

			`# TODO: Fix this --- just do it properly, make the full edit matrix and`
			`# then walk back over it...`
			`# Preprocess inputs`
			`cand_words = [punct_re.sub('', w) for w in cand_words]`
			`gold_words = [punct_re.sub('', w) for w in gold_words]`
* Read input json in a streaming way 2015-05-27 20:13:11 +03:00
			`if cand_words == gold_words:`
* Fix output from _min_edit_path when inputs match. 2015-06-06 06:58:53 +03:00			`return 0, ''.join(['M' for _ in gold_words])`
* Read input json in a streaming way 2015-05-27 20:13:11 +03:00			`mem = Pool()`
* Add functions for Levenshtein distance alignment 2015-05-24 22:50:48 +03:00			`n_cand = len(cand_words)`
			`n_gold = len(gold_words)`
			`# Levenshtein distance, except we need the history, and we may want different`
			`# costs.`
			`# Mark operations with a string, and score the history using _edit_cost.`
			`previous_row = []`
			`prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))`
			`curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))`
			`for i in range(n_gold + 1):`
			`cell = ''`
			`for j in range(i):`
			`cell += 'I'`
			`previous_row.append('I' * i)`
			`prev_costs[i] = i`
			`for i, cand in enumerate(cand_words):`
			`current_row = ['D' * (i + 1)]`
			`curr_costs[0] = i+1`
			`for j, gold in enumerate(gold_words):`
			`if gold.lower() == cand.lower():`
			`s_cost = prev_costs[j]`
			`i_cost = curr_costs[j] + 1`
			`d_cost = prev_costs[j + 1] + 1`
			`else:`
			`s_cost = prev_costs[j] + 1`
			`i_cost = curr_costs[j] + 1`
			`d_cost = prev_costs[j + 1] + (1 if cand else 0)`

			`if s_cost <= i_cost and s_cost <= d_cost:`
			`best_cost = s_cost`
			`best_hist = previous_row[j] + ('M' if gold == cand else 'S')`
			`elif i_cost <= s_cost and i_cost <= d_cost:`
			`best_cost = i_cost`
			`best_hist = current_row[j] + 'I'`
			`else:`
			`best_cost = d_cost`
			`best_hist = previous_row[j + 1] + 'D'`

			`current_row.append(best_hist)`
			`curr_costs[j+1] = best_cost`
			`previous_row = current_row`
			`for j in range(len(gold_words) + 1):`
			`prev_costs[j] = curr_costs[j]`
			`curr_costs[j] = 0`

			`return prev_costs[n_gold], previous_row[-1]`

* Read input json in a streaming way 2015-05-27 20:13:11 +03:00
* Allow training documents to be filtered in gold.pyx 2015-06-12 03:42:08 +03:00			`def read_json_file(loc, docs_filter=None):`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 04:52:55 +03:00			`if path.isdir(loc):`
			`for filename in os.listdir(loc):`
			`yield from read_json_file(path.join(loc, filename))`
			`else:`
Fix json loading, for Python 3. 2016-10-20 22:23:26 +03:00			`with io.open(loc, 'r', encoding='utf8') as file_:`
* Allow json to be used as a fallback if ujson is not available 2015-07-25 19:11:36 +03:00			`docs = json.load(file_)`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 18:54:52 +03:00			`for doc in docs:`
* Allow training documents to be filtered in gold.pyx 2015-06-12 03:42:08 +03:00			`if docs_filter is not None and not docs_filter(doc):`
			`continue`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 18:54:52 +03:00			`paragraphs = []`
			`for paragraph in doc['paragraphs']:`
			`sents = []`
			`for sent in paragraph['sentences']:`
			`words = []`
			`ids = []`
			`tags = []`
			`heads = []`
			`labels = []`
			`ner = []`
			`for i, token in enumerate(sent['tokens']):`
			`words.append(token['orth'])`
			`ids.append(i)`
don't require json-files to have syntactic annotation 2016-04-22 17:32:27 +03:00			`tags.append(token.get('tag','-'))`
			`heads.append(token.get('head',0) + i)`
don't require read_json_file to expect particular annotations 2016-05-02 16:29:30 +03:00			`labels.append(token.get('dep',''))`
* Ensure root albel is spelled ROOT, for backwards compatibility 2015-06-23 05:14:03 +03:00			`# Ensure ROOT label is case-insensitive`
			`if labels[-1].lower() == 'root':`
			`labels[-1] = 'ROOT'`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 18:54:52 +03:00			`ner.append(token.get('ner', '-'))`
			`sents.append((`
			`(ids, words, tags, heads, labels, ner),`
			`sent.get('brackets', [])))`
			`if sents:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`yield (paragraph.get('raw', None), sents)`
* Add read_json_file to conll.pyx 2015-05-06 17:27:31 +03:00

* Add read_conll03_file function to conll.pyx 2015-04-10 05:59:11 +03:00			`def _iob_to_biluo(tags):`
			`out = []`
			`curr_label = None`
			`tags = list(tags)`
			`while tags:`
			`out.extend(_consume_os(tags))`
			`out.extend(_consume_ent(tags))`
			`return out`


			`def _consume_os(tags):`
			`while tags and tags[0] == 'O':`
			`yield tags.pop(0)`


			`def _consume_ent(tags):`
			`if not tags:`
			`return []`
			`target = tags.pop(0).replace('B', 'I')`
			`length = 1`
			`while tags and tags[0] == target:`
			`length += 1`
			`tags.pop(0)`
			`label = target[2:]`
			`if length == 1:`
			`return ['U-' + label]`
			`else:`
			`start = 'B-' + label`
			`end = 'L-' + label`
			`middle = ['I-%s' % label for _ in range(1, length - 1)]`
			`return [start] + middle + [end]`


* Tmp 2015-03-09 08:46:22 +03:00			`cdef class GoldParse:`
Draft a refactored init for the GoldParse class 2016-10-15 23:09:52 +03:00			`@classmethod`
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):`
			`_, words, tags, heads, deps, entities = annot_tuples`
			`return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,`
			`make_projective=make_projective)`

Fix GoldParse 2016-10-16 00:55:07 +03:00			`def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,`
			`deps=None, entities=None, make_projective=False):`
Draft a refactored init for the GoldParse class 2016-10-15 23:09:52 +03:00			`if words is None:`
			`words = [token.text for token in doc]`
			`if tags is None:`
			`tags = [None for _ in doc]`
			`if heads is None:`
Fix GoldParse class 2016-10-16 12:41:36 +03:00			`heads = [token.i for token in doc]`
Draft a refactored init for the GoldParse class 2016-10-15 23:09:52 +03:00			`if deps is None:`
			`deps = [None for _ in doc]`
			`if entities is None:`
			`entities = [None for _ in doc]`
			`elif len(entities) == 0:`
			`entities = ['O' for _ in doc]`
			`elif not isinstance(entities[0], basestring):`
			`# Assume we have entities specified by character offset.`
			`entities = biluo_tags_from_offsets(doc, entities)`

* Tmp 2015-03-09 08:46:22 +03:00			`self.mem = Pool()`
			`self.loss = 0`
Fix GoldParse 2016-10-16 00:55:07 +03:00			`self.length = len(doc)`
* Tmp 2015-03-09 08:46:22 +03:00
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 14:06:01 +03:00			`# These are filled by the tagger/parser/entity recogniser`
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))`
			`self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))`
			`self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))`
			`self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))`
* Tmp 2015-03-09 08:46:22 +03:00
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`self.tags = [None] * len(doc)`
			`self.heads = [None] * len(doc)`
			`self.labels = [''] * len(doc)`
			`self.ner = ['-'] * len(doc)`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`self.cand_to_gold = align([t.orth_ for t in doc], words)`
			`self.gold_to_cand = align(words, [t.orth_ for t in doc])`
* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer 2015-05-24 18:35:49 +03:00
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`annot_tuples = (range(len(words)), words, tags, heads, deps, entities)`
* Python3 correction for GoldParse 2015-07-28 15:44:53 +03:00			`self.orig_annot = list(zip(*annot_tuples))`
* Tmp 2015-03-09 08:46:22 +03:00
* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer 2015-05-24 18:35:49 +03:00			`for i, gold_i in enumerate(self.cand_to_gold):`
Fix GoldParse class 2016-10-16 12:41:36 +03:00			`if doc[i].text.isspace():`
* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity 2015-07-09 14:30:41 +03:00			`self.tags[i] = 'SP'`
			`self.heads[i] = None`
			`self.labels[i] = None`
			`self.ner[i] = 'O'`
Fix GoldParse class 2016-10-16 12:41:36 +03:00			`if gold_i is None:`
* Tmp 2015-03-09 08:46:22 +03:00			`pass`
* Tmp commit. Working on whole document parsing 2015-05-24 03:49:56 +03:00			`else:`
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`self.tags[i] = tags[gold_i]`
			`self.heads[i] = self.gold_to_cand[heads[gold_i]]`
			`self.labels[i] = deps[gold_i]`
			`self.ner[i] = entities[gold_i]`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 16:40:40 +03:00
			`cycle = nonproj.contains_cycle(self.heads)`
			`if cycle != None:`
			`raise Exception("Cycle found: %s" % cycle)`

* Allow gold parse to cut non-projective arcs 2015-05-31 02:11:56 +03:00			`if make_projective:`
Improve the API for the GoldParse class. 2016-10-16 00:53:29 +03:00			`proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)`
adjust train.py to train both english and german models 2016-03-03 17:21:00 +03:00			`self.heads = proj_heads`
* Add cycle-checking code in gold.pyx 2015-06-23 01:02:22 +03:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 20:00:23 +03:00			`def __len__(self):`
			`return self.length`
* Tmp 2015-03-09 08:46:22 +03:00
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00			`@property`
			`def is_projective(self):`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 13:26:25 +03:00			`return not nonproj.is_nonproj_tree(self.heads)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-30 02:25:46 +03:00
* Add file to hold GoldParse class 2015-02-21 19:06:58 +03:00
Add function for entity->biluo transformation 2016-10-15 22:51:04 +03:00			`def biluo_tags_from_offsets(doc, entities):`
			`'''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out`
			`scheme (biluo).`

			`Arguments:`
			`doc (Doc):`
			`The document that the entity offsets refer to. The output tags will`
			`refer to the token boundaries within the document.`

			`entities (sequence):`
			`A sequence of (start, end, label) triples. start and end should be`
			`character-offset integers denoting the slice into the original string.`

			`Returns:`
			`tags (list):`
			`A list of unicode strings, describing the tags. Each tag string will`
			`be of the form either "", "O" or "{action}-{label}", where action is one`
Fix GoldParse class 2016-10-16 12:41:36 +03:00			`of "B", "I", "L", "U". The string "-" is used where the entity`
Add function for entity->biluo transformation 2016-10-15 22:51:04 +03:00			`offsets don't align with the tokenization in the Doc object. The`
			`training algorithm will view these as missing values. "O" denotes`
			`a non-entity token. "B" denotes the beginning of a multi-token entity,`
			`"I" the inside of an entity of three or more tokens, and "L" the end`
			`of an entity of two or more tokens. "U" denotes a single-token entity.`

			`Example:`
			`text = 'I like London.'`
			`entities = [(len('I like '), len('I like London'), 'LOC')]`
			`doc = nlp.tokenizer(text)`

			`tags = biluo_tags_from_offsets(doc, entities)`

			`assert tags == ['O', 'O', 'U-LOC', 'O']`
			`'''`
			`starts = {token.idx: token.i for token in doc}`
			`ends = {token.idx+len(token): token.i for token in doc}`
Fix GoldParse class 2016-10-16 12:41:36 +03:00			`biluo = ['-' for _ in doc]`
Add function for entity->biluo transformation 2016-10-15 22:51:04 +03:00			`# Handle entity cases`
			`for start_char, end_char, label in entities:`
			`start_token = starts.get(start_char)`
			`end_token = ends.get(end_char)`
			`# Only interested if the tokenization is correct`
			`if start_token is not None and end_token is not None:`
			`if start_token == end_token:`
			`biluo[start_token] = 'U-%s' % label`
			`else:`
			`biluo[start_token] = 'B-%s' % label`
			`for i in range(start_token+1, end_token):`
			`biluo[i] = 'I-%s' % label`
			`biluo[end_token] = 'L-%s' % label`
			`# Now distinguish the O cases from ones where we miss the tokenization`
			`entity_chars = set()`
			`for start_char, end_char, label in entities:`
			`for i in range(start_char, end_char):`
			`entity_chars.add(i)`
			`for token in doc:`
			`for i in range(token.idx, token.idx+len(token)):`
			`if i in entity_chars:`
			`break`
			`else:`
			`biluo[token.i] = 'O'`
			`return biluo`


* Add file to hold GoldParse class 2015-02-21 19:06:58 +03:00			`def is_punct_label(label):`
			`return label == 'P' or label.lower() == 'punct'`