mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
563f46f026
The TextCategorizer class is supposed to support multi-label text classification, and allow training data to contain missing values. For this to work, the gradient of the loss should be 0 when labels are missing. Instead, there was no way to actually denote "missing" in the GoldParse class, and so the TextCategorizer class treated the label set within gold.cats as complete. To fix this, we change GoldParse.cats to be a dict instead of a list. The GoldParse.cats dict should map labels to floats, with 1.0 denoting 'present' and 0.0 denoting 'absent'. Gradients are zeroed for categories absent from the gold.cats dict. A nice bonus is that you can also set values between 0 and 1 for partial membership. You can also set numeric values, if you're using a text classification model that uses an appropriate loss function. Unfortunately this is a breaking change, although the functionality was only recently introduced and hasn't been properly documented yet. I've updated the example script accordingly.
557 lines
20 KiB
Cython
557 lines
20 KiB
Cython
# cython: profile=True
|
|
# coding: utf8
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import io
|
|
import re
|
|
import ujson
|
|
import random
|
|
import cytoolz
|
|
import itertools
|
|
|
|
from .syntax import nonproj
|
|
from .util import ensure_path
|
|
from . import util
|
|
from .tokens import Doc
|
|
|
|
|
|
def tags_to_entities(tags):
    """Collect entity spans from a sequence of BILUO tags.

    tags (list): Per-token tag strings. `None` and '-' entries are skipped.
    RETURNS (list): `(label, start, end)` tuples, where `start` and `end`
        are inclusive token indices and `label` is the tag text after the
        two-character prefix.
    """
    spans = []
    open_at = None
    for idx, tag in enumerate(tags):
        if tag is None or tag == '-':
            continue
        prefix = tag[:1]
        if prefix == 'O':
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            open_at = None
        elif prefix == 'I':
            # An inside tag is only valid while an entity is open.
            assert open_at is not None, tags[:idx]
        elif prefix == 'U':
            spans.append((tag[2:], idx, idx))
        elif prefix == 'B':
            open_at = idx
        elif prefix == 'L':
            spans.append((tag[2:], open_at, idx))
            open_at = None
        else:
            raise Exception(tag)
    return spans
|
|
|
|
|
|
def merge_sents(sents):
    """Concatenate the sentences of a paragraph into one annotation tuple.

    sents (list): `((ids, words, tags, heads, labels, ner), brackets)` pairs,
        one per sentence. Each bracket is a dict with 'first', 'last' and
        'label' keys.
    RETURNS (list): A single-element list `[(annotations, brackets)]`, where
        `annotations` holds six lists and all token indices (ids, heads,
        bracket bounds) are shifted by the number of preceding tokens.
    """
    all_ids, all_words, all_tags = [], [], []
    all_heads, all_labels, all_ner = [], [], []
    all_brackets = []
    offset = 0
    for annotations, brackets in sents:
        ids, words, tags, heads, labels, ner = annotations
        all_ids += [tok_id + offset for tok_id in ids]
        all_words += words
        all_tags += tags
        all_heads += [head + offset for head in heads]
        all_labels += labels
        all_ner += ner
        all_brackets += [(b['first'] + offset, b['last'] + offset, b['label'])
                         for b in brackets]
        offset += len(ids)
    merged = [all_ids, all_words, all_tags, all_heads, all_labels, all_ner]
    return [(merged, all_brackets)]
|
|
|
|
|
|
def align(cand_words, gold_words):
    """Map candidate tokens onto gold tokens via a minimum edit path.

    cand_words (list): The candidate tokenization.
    gold_words (list): The gold-standard tokenization.
    RETURNS (list): One entry per 'M', 'S' or 'D' move in the edit path:
        the index of the matched gold word for 'M', otherwise `None`.
    """
    _, edit_path = _min_edit_path(cand_words, gold_words)
    alignment = []
    gold_idx = 0
    for op in edit_path:
        if op not in ('M', 'S', 'D', 'I'):
            raise Exception(op)
        if op == 'M':
            alignment.append(gold_idx)
        elif op != 'I':
            # Substitutions and deletions have no usable gold counterpart.
            alignment.append(None)
        if op != 'D':
            # Every move except a deletion consumes one gold word.
            gold_idx += 1
    return alignment
|
|
|
|
|
|
# Matches any non-word character; used to normalise words before alignment.
punct_re = re.compile(r'\W')


def _min_edit_path(cand_words, gold_words):
    """Find a minimum-cost edit path between two tokenizations.

    Words are compared after removing non-word characters and lower-casing.

    cand_words (list): Candidate words, as unicode strings.
    gold_words (list): Gold-standard words, as unicode strings.
    RETURNS (tuple): `(cost, path)`. `path` is a string with one character
        per move: 'M' (match), 'S' (substitution), 'I' (gold word with no
        candidate word) or 'D' (candidate word with no gold word).
    """
    cdef:
        Pool mem
        int i, j, n_cand, n_gold
        int* curr_costs
        int* prev_costs

    # TODO: Fix this --- just do it properly, make the full edit matrix and
    # then walk back over it...
    # Preprocess inputs
    cand_words = [punct_re.sub('', w).lower() for w in cand_words]
    gold_words = [punct_re.sub('', w).lower() for w in gold_words]

    if cand_words == gold_words:
        return 0, 'M' * len(gold_words)
    mem = Pool()
    n_cand = len(cand_words)
    n_gold = len(gold_words)
    # Levenshtein distance, except we need the history, and we may want different
    # costs.
    # Each cell keeps its edit history as a string of moves next to its cost.
    previous_row = []
    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    # First row: reaching gold position i with no candidate words takes i inserts.
    for i in range(n_gold + 1):
        previous_row.append('I' * i)
        prev_costs[i] = i
    for i, cand in enumerate(cand_words):
        # First column: i + 1 candidate words and no gold words means deletions.
        current_row = ['D' * (i + 1)]
        curr_costs[0] = i+1
        for j, gold in enumerate(gold_words):
            if gold.lower() == cand.lower():
                s_cost = prev_costs[j]
                i_cost = curr_costs[j] + 1
                d_cost = prev_costs[j + 1] + 1
            else:
                s_cost = prev_costs[j] + 1
                i_cost = curr_costs[j] + 1
                # Deleting a candidate that normalised to '' is free.
                d_cost = prev_costs[j + 1] + (1 if cand else 0)

            # Ties prefer match/substitute, then insert, then delete.
            if s_cost <= i_cost and s_cost <= d_cost:
                best_cost = s_cost
                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
            elif i_cost <= s_cost and i_cost <= d_cost:
                best_cost = i_cost
                best_hist = current_row[j] + 'I'
            else:
                best_cost = d_cost
                best_hist = previous_row[j + 1] + 'D'

            current_row.append(best_hist)
            curr_costs[j+1] = best_cost
        # Roll the current row into the previous one for the next candidate.
        previous_row = current_row
        for j in range(len(gold_words) + 1):
            prev_costs[j] = curr_costs[j]
            curr_costs[j] = 0

    return prev_costs[n_gold], previous_row[-1]
|
|
|
|
|
|
def minibatch(items, size=8):
    """Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.

    items (iterable): The items to group into batches.
    size (int or iterable): A fixed batch size, or an iterable giving the
        size of each successive batch. Sizes are truncated with `int()`.
    YIELDS (list): The next batch. The last batch may be shorter. Iteration
        stops when `items` or the size iterable is exhausted.
    """
    if isinstance(size, int):
        # Honour the caller's batch size rather than a fixed constant.
        sizes = itertools.repeat(size)
    else:
        sizes = iter(size)
    items = iter(items)
    # Looping over `sizes` ends cleanly if a finite size schedule runs out,
    # instead of leaking StopIteration out of the generator body.
    for batch_size in sizes:
        batch = list(itertools.islice(items, int(batch_size)))
        if not batch:
            break
        yield batch
|
|
|
|
|
|
class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER."""
    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
        """Create a GoldCorpus.

        train_path (unicode or Path): File or directory of training data.
        dev_path (unicode or Path): File or directory of development data.
        gold_preproc (bool): Accepted but not used by the constructor.
        limit (int): Optional cap applied while iterating `train_tuples`
            and `dev_tuples`. A falsy value disables the cap.
        """
        self.train_path = util.ensure_path(train_path)
        self.dev_path = util.ensure_path(dev_path)
        self.limit = limit
        self.train_locs = self.walk_corpus(self.train_path)
        self.dev_locs = self.walk_corpus(self.dev_path)

    @property
    def train_tuples(self):
        """Yield `[raw_text, sentences]` items from the training files.

        Stops entirely once the number of sentences yielded reaches
        `self.limit` (when a limit is set).
        """
        n_sents = 0
        for loc in self.train_locs:
            for item in read_json_file(loc):
                yield item
                n_sents += len(item[1])
                if self.limit and n_sents >= self.limit:
                    # Leave the generator, not just the current file.
                    return

    @property
    def dev_tuples(self):
        """Yield `[raw_text, sentences]` items from the development files.

        Stops entirely once `self.limit` items have been yielded (when a
        limit is set). NOTE(review): items are counted here, whereas
        `train_tuples` counts sentences -- confirm this is intended.
        """
        n_items = 0
        for loc in self.dev_locs:
            for item in read_json_file(loc):
                yield item
                n_items += 1
                if self.limit and n_items >= self.limit:
                    # Leave the generator, not just the current file.
                    return

    def count_train(self):
        """Count the words in the training data.

        RETURNS (int): Total number of words over the sentences visited.
        """
        n = 0
        i = 0
        for raw_text, paragraph_tuples in self.train_tuples:
            n += sum(len(s[0][1]) for s in paragraph_tuples)
            if self.limit and i >= self.limit:
                break
            i += len(paragraph_tuples)
        return n

    def train_docs(self, nlp, gold_preproc=False,
                   projectivize=False, max_length=None,
                   noise_level=0.0):
        """Yield shuffled `(doc, gold)` pairs for training.

        nlp: Pipeline object providing `make_doc` and `vocab`.
        gold_preproc (bool): Build one `Doc` per gold sentence from the gold
            words, instead of tokenizing the raw text.
        projectivize (bool): Run the tuples through
            `nonproj.preprocess_training_data` first.
        max_length (int): If set, skip docs with `len(doc) >= max_length`.
        noise_level (float): Passed to `add_noise` when creating the docs.
        YIELDS (tuple): `(doc, gold)` pairs.
        """
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
                               self.train_tuples)
        # random.shuffle needs a mutable sequence; `self.train_tuples` is a
        # generator, so materialise it before shuffling.
        train_tuples = list(train_tuples)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
                                        noise_level=noise_level)
        yield from gold_docs

    def dev_docs(self, nlp, gold_preproc=False):
        """Yield `(doc, gold)` pairs for the development data, in file order."""
        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
        yield from gold_docs

    @classmethod
    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
                       noise_level=0.0):
        """Convert `[raw_text, sentences]` items into `(doc, gold)` pairs.

        With `gold_preproc` the raw text is discarded and each sentence
        becomes its own doc; otherwise the sentences are merged and the raw
        text is tokenized as one doc.
        """
        for raw_text, paragraph_tuples in tuples:
            if gold_preproc:
                raw_text = None
            else:
                paragraph_tuples = merge_sents(paragraph_tuples)

            docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                  gold_preproc, noise_level=noise_level)
            golds = cls._make_golds(docs, paragraph_tuples)
            for doc, gold in zip(docs, golds):
                if (not max_length) or len(doc) < max_length:
                    yield doc, gold

    @classmethod
    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
                   noise_level=0.0):
        """Create the docs for one paragraph.

        RETURNS (list): A single doc made from the (possibly noised) raw
            text, or, when `raw_text` is None, one doc per sentence built
            from the (possibly noised) gold words.
        """
        if raw_text is not None:
            raw_text = add_noise(raw_text, noise_level)
            return [nlp.make_doc(raw_text)]
        return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                for (sent_tuples, brackets) in paragraph_tuples]

    @classmethod
    def _make_golds(cls, docs, paragraph_tuples):
        """Create one `GoldParse` per doc from the matching sentence tuples."""
        assert len(docs) == len(paragraph_tuples)
        return [GoldParse.from_annot_tuples(doc, sent_tuples)
                for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]

    @staticmethod
    def walk_corpus(path):
        """Collect the data files under `path`.

        path (Path): A file or directory.
        RETURNS (list): `[path]` if it is not a directory. Otherwise every
            '.json' file found below it, skipping any entry whose name
            starts with '.'.
        """
        if not path.is_dir():
            return [path]
        # `pending` grows while it is being iterated, which walks the tree
        # breadth-first without recursion.
        pending = [path]
        locs = []
        seen = set()
        for current in pending:
            key = str(current)
            if key in seen:
                continue
            seen.add(key)
            name = current.parts[-1]
            if name.startswith('.'):
                continue
            elif current.is_dir():
                pending.extend(current.iterdir())
            elif name.endswith('.json'):
                locs.append(current)
        return locs
|
|
|
|
|
|
def add_noise(orig, noise_level):
    """Randomly corrupt a text or a list of words.

    orig (unicode or list): The text, or list of word strings, to corrupt.
    noise_level (float): Chance of corrupting the input at all, and then of
        corrupting each character (for text) or word (for a list).
    RETURNS (unicode or list): `orig` itself when no noise is applied.
        Otherwise a new string, or a new list with words that were
        corrupted to the empty string removed.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance, so that list subclasses are treated as word lists rather
    # than being joined into a single string.
    if isinstance(orig, list):
        corrupted = (_corrupt(word, noise_level) for word in orig)
        return [w for w in corrupted if w]
    return ''.join(_corrupt(c, noise_level) for c in orig)
|
|
|
|
|
|
def _corrupt(c, noise_level):
|
|
if random.random() >= noise_level:
|
|
return c
|
|
elif c == ' ':
|
|
return '\n'
|
|
elif c == '\n':
|
|
return ' '
|
|
elif c in ['.', "'", "!", "?"]:
|
|
return ''
|
|
else:
|
|
return c.lower()
|
|
|
|
|
|
def _read_json_sentence(sent):
    """Convert one JSON sentence record into `[annotations, brackets]`.

    `annotations` is `[ids, words, tags, heads, labels, ner]`. Heads are
    stored in the JSON as offsets relative to the token and converted to
    absolute indices within the sentence here.
    """
    ids = []
    words = []
    tags = []
    heads = []
    labels = []
    ner = []
    for i, token in enumerate(sent['tokens']):
        words.append(token['orth'])
        ids.append(i)
        tags.append(token.get('tag', '-'))
        heads.append(token.get('head', 0) + i)
        label = token.get('dep', '')
        # Ensure ROOT label is case-insensitive
        if label.lower() == 'root':
            label = 'ROOT'
        labels.append(label)
        ner.append(token.get('ner', '-'))
    return [[ids, words, tags, heads, labels, ner], sent.get('brackets', [])]


def read_json_file(loc, docs_filter=None, limit=None):
    """Read training data from a JSON file, or from every entry of a directory.

    loc (unicode or Path): Path to a JSON file or a directory.
    docs_filter (callable): Optional predicate; documents for which it
        returns a falsy value are skipped.
    limit (int): Optional maximum number of documents to read per file.
    YIELDS (list): `[raw_text, sentences]` for each paragraph that has at
        least one sentence. `raw_text` is None if the paragraph has no 'raw'
        key, and each sentence is `[[ids, words, tags, heads, labels, ner],
        brackets]`.
    """
    loc = ensure_path(loc)
    if loc.is_dir():
        # iterdir() yields paths that already include `loc`, so they are
        # used directly; the filter is forwarded so it applies at every level.
        for child in loc.iterdir():
            yield from read_json_file(child, docs_filter=docs_filter,
                                      limit=limit)
    else:
        with loc.open('r', encoding='utf8') as file_:
            docs = ujson.load(file_)
        if limit is not None:
            docs = docs[:limit]
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue
            for paragraph in doc['paragraphs']:
                sents = [_read_json_sentence(sent)
                         for sent in paragraph['sentences']]
                if sents:
                    yield [paragraph.get('raw', None), sents]
|
|
|
|
|
|
def iob_to_biluo(tags):
    """Convert a sequence of IOB tags to the BILUO scheme.

    tags (iterable): The IOB tag strings. The input is copied, not modified.
    RETURNS (list): The corresponding BILUO tag strings.
    """
    remaining = list(tags)
    biluo = []
    # Alternate between draining a run of 'O' tags and one entity until the
    # working copy is empty.
    while remaining:
        biluo += _consume_os(remaining)
        biluo += _consume_ent(remaining)
    return biluo
|
|
|
|
|
|
def _consume_os(tags):
|
|
while tags and tags[0] == 'O':
|
|
yield tags.pop(0)
|
|
|
|
|
|
def _consume_ent(tags):
|
|
if not tags:
|
|
return []
|
|
target = tags.pop(0).replace('B', 'I')
|
|
length = 1
|
|
while tags and tags[0] == target:
|
|
length += 1
|
|
tags.pop(0)
|
|
label = target[2:]
|
|
if length == 1:
|
|
return ['U-' + label]
|
|
else:
|
|
start = 'B-' + label
|
|
end = 'L-' + label
|
|
middle = ['I-%s' % label for _ in range(1, length - 1)]
|
|
return [start] + middle + [end]
|
|
|
|
|
|
cdef class GoldParse:
    """Collection for training annotations."""
    @classmethod
    def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
        """Create a GoldParse from a six-element annotation tuple.

        doc (Doc): The document the annotations refer to.
        annot_tuples (tuple): `(ids, words, tags, heads, deps, entities)`.
            The first element is ignored.
        make_projective (bool): Passed through to the constructor.
        RETURNS (GoldParse): The newly constructed object.
        """
        _, words, tags, heads, deps, entities = annot_tuples
        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
                   make_projective=make_projective)

    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False,
                 cats=None):
        """Create a GoldParse.

        doc (Doc): The document the annotations refer to.
        annot_tuples: Not read by the constructor; the name is rebound
            internally before use.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        heads (iterable): A sequence of integers, representing syntactic head offsets.
        deps (iterable): A sequence of strings, representing the syntactic relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        make_projective (bool): If true, the aligned heads are replaced by
            the heads returned from `nonproj.projectivize`.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative examples
            of a label to have the value 0.0. Labels not in the dictionary are
            treated as missing -- the gradient for those labels will be zero.
        RETURNS (GoldParse): The newly constructed object.
        """
        # Fill in defaults for annotation layers that were not supplied.
        if words is None:
            words = [token.text for token in doc]
        if tags is None:
            tags = [None for _ in doc]
        if heads is None:
            heads = [None for token in doc]
        if deps is None:
            deps = [None for _ in doc]
        if entities is None:
            entities = [None for _ in doc]
        elif len(entities) == 0:
            # An explicitly empty list means every token is outside an entity.
            entities = ['O' for _ in doc]
        elif not isinstance(entities[0], basestring):
            # Assume we have entities specified by character offset.
            entities = biluo_tags_from_offsets(doc, entities)

        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        # These are filled by the tagger/parser/entity recogniser
        # (one slot per doc token is allocated from the pool here).
        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
        self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

        # Copy so that later changes to the caller's dict are not shared.
        self.cats = {} if cats is None else dict(cats)
        # Per-doc-token annotations; entries stay None where nothing aligns.
        self.words = [None] * len(doc)
        self.tags = [None] * len(doc)
        self.heads = [None] * len(doc)
        self.labels = [None] * len(doc)
        self.ner = [None] * len(doc)

        # Alignments in both directions between doc tokens and gold words.
        self.cand_to_gold = align([t.orth_ for t in doc], words)
        self.gold_to_cand = align(words, [t.orth_ for t in doc])

        # Keep the annotations as given, one tuple per gold word.
        annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
        self.orig_annot = list(zip(*annot_tuples))

        for i, gold_i in enumerate(self.cand_to_gold):
            if doc[i].text.isspace():
                self.words[i] = doc[i].text
                self.tags[i] = 'SP'
                self.heads[i] = None
                self.labels[i] = None
                self.ner[i] = 'O'
            # NOTE(review): this is `if`, not `elif`, so the whitespace values
            # above are overwritten when the token is also aligned -- confirm.
            if gold_i is None:
                pass
            else:
                self.words[i] = words[gold_i]
                self.tags[i] = tags[gold_i]
                if heads[gold_i] is None:
                    self.heads[i] = None
                else:
                    # Translate the gold head index into a doc token index.
                    self.heads[i] = self.gold_to_cand[heads[gold_i]]
                self.labels[i] = deps[gold_i]
                self.ner[i] = entities[gold_i]

        # Refuse annotations whose aligned heads contain a cycle.
        cycle = nonproj.contains_cycle(self.heads)
        if cycle != None:
            raise Exception("Cycle found: %s" % cycle)

        if make_projective:
            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
            self.heads = proj_heads

    def __len__(self):
        """Get the length recorded at construction.

        RETURNS (int): `len(doc)` for the doc passed to the constructor.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)

    @property
    def sent_starts(self):
        """The values of the C-level `sent_start` array, as a Python list."""
        return [self.c.sent_start[i] for i in range(self.length)]
|
|
|
|
|
|
def biluo_tags_from_offsets(doc, entities, missing='O'):
    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
    scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
        `end` should be character-offset integers denoting the slice into the
        original string.
    missing (unicode): The tag given to tokens that no entity span touches.

    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "-", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The string "-" is used where the
        entity offsets don't align with the tokenization in the `Doc` object. The
        training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
    """
    # The spans are traversed twice below, so materialise them first: a
    # one-shot iterator would otherwise be empty on the second pass.
    entities = list(entities)
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
    biluo = ['-' for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = 'U-%s' % label
            else:
                biluo[start_token] = 'B-%s' % label
                for i in range(start_token+1, end_token):
                    biluo[i] = 'I-%s' % label
                biluo[end_token] = 'L-%s' % label
    # Now distinguish the O cases from ones where we miss the tokenization
    entity_chars = set()
    for start_char, end_char, label in entities:
        entity_chars.update(range(start_char, end_char))
    for token in doc:
        token_chars = range(token.idx, token.idx + len(token))
        # Tokens overlapping no entity are truly outside. Tokens that overlap
        # one but were not tagged above keep the '-' placeholder.
        if not any(i in entity_chars for i in token_chars):
            biluo[token.i] = missing
    return biluo
|
|
|
|
|
|
def is_punct_label(label):
    """Check whether a dependency label denotes punctuation.

    label (unicode): The dependency label.
    RETURNS (bool): True for exactly 'P', or for 'punct' in any casing.
    """
    if label == 'P':
        return True
    return label.lower() == 'punct'
|