mirror of https://github.com/explosion/spaCy.git (synced 2025-01-25 00:34:20 +03:00)

Resolve fine-tuning conflict

This commit is contained in:
commit 43210abacc
@@ -1 +1,56 @@
environment:

  matrix:

    # For Python versions available on Appveyor, see
    # http://www.appveyor.com/docs/installed-software#python
    # The list here is complete (excluding Python 2.6, which
    # isn't covered by this document) at the time of writing.

    - PYTHON: "C:\\Python27"
    #- PYTHON: "C:\\Python33"
    #- PYTHON: "C:\\Python34"
    #- PYTHON: "C:\\Python35"
    #- PYTHON: "C:\\Python27-x64"
    #- PYTHON: "C:\\Python33-x64"
    #- DISTUTILS_USE_SDK: "1"
    #- PYTHON: "C:\\Python34-x64"
    #- DISTUTILS_USE_SDK: "1"
    #- PYTHON: "C:\\Python35-x64"
    - PYTHON: "C:\\Python36-x64"

install:
  # We need wheel installed to build wheels
  - "%PYTHON%\\python.exe -m pip install wheel"
  - "%PYTHON%\\python.exe -m pip install cython"
  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
  - "%PYTHON%\\python.exe setup.py build_ext --inplace"
  - "%PYTHON%\\python.exe -m pip install -e ."

build: off

test_script:
  # Put your test command here.
  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
  # you can remove "build.cmd" from the front of the command, as it's
  # only needed to support those cases.
  # Note that you must use the environment variable %PYTHON% to refer to
  # the interpreter you're using - Appveyor does not do anything special
  # to put the Python version you want to use on PATH.
  - "%PYTHON%\\python.exe -m pytest spacy/"

after_test:
  # This step builds your wheels.
  # Again, you only need build.cmd if you're building C extensions for
  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
  # interpreter
  - "%PYTHON%\\python.exe setup.py bdist_wheel"

artifacts:
  # bdist_wheel puts your built wheel in the dist directory
  - path: dist\*

#on_success:
#  You can use this step to upload your artifacts to a public website.
#  See Appveyor's documentation for more details. Or you can simply
#  access your wheels from the Appveyor "artifacts" tab for your build.
@@ -13,24 +13,27 @@ Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

Developed for: spaCy 1.7.1
Last tested for: spaCy 1.7.1
Last tested for: spaCy 2.0.0a13
'''
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
import random
import json
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import tqdm

import spacy.orth as orth_funcs
from spacy.vocab import Vocab
from spacy.pipeline import BeamEntityRecognizer
from spacy.pipeline import EntityRecognizer
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
from spacy.gold import _iob_to_biluo as iob_to_biluo
from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
from spacy.scorer import Scorer
import spacy.util

try:
    unicode
@@ -38,95 +41,40 @@ except NameError:
    unicode = str


spacy.util.set_env_log(True)


def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
            SHAPE: orth_funcs.word_shape,
            NORM: lambda string: string.lower(),
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
            CLUSTER: lambda string: 0,
            IS_ALPHA: orth_funcs.is_alpha,
            IS_ASCII: orth_funcs.is_ascii,
            IS_DIGIT: lambda string: string.isdigit(),
            IS_LOWER: orth_funcs.is_lower,
            IS_PUNCT: orth_funcs.is_punct,
            IS_SPACE: lambda string: string.isspace(),
            IS_TITLE: orth_funcs.is_title,
            IS_UPPER: orth_funcs.is_upper,
            IS_STOP: lambda string: False,
            IS_OOV: lambda string: True
        })


def save_vocab(vocab, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    elif not path.is_dir():
        raise IOError("Can't save vocab to %s\nNot a directory" % path)
    with (path / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    vocab.dump((path / 'lexemes.bin').as_posix())


def load_vocab(path):
    path = Path(path)
    if not path.exists():
        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
    if not path.is_dir():
        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
    return Vocab.load(path)


def init_ner_model(vocab, features=None):
    if features is None:
        features = tuple(EntityRecognizer.feature_templates)
    return EntityRecognizer(vocab, features=features)


def save_ner_model(model, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    if not path.is_dir():
        raise IOError("Can't save model to %s\nNot a directory" % path)
    model.model.dump((path / 'model').as_posix())
    with (path / 'config.json').open('w') as file_:
        data = json.dumps(model.cfg)
        if not isinstance(data, unicode):
            data = data.decode('utf8')
        file_.write(data)


def load_ner_model(vocab, path):
    return EntityRecognizer.load(path, vocab)


class Pipeline(object):
    @classmethod
    def load(cls, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        vocab = load_vocab(path)
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)

    def __init__(self, vocab=None, tokenizer=None, entity=None):
    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if tensorizer is None:
            tensorizer = TokenVectorEncoder(vocab)
        if entity is None:
            entity = init_ner_model(self.vocab)
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tensorizer = tensorizer
        self.entity = entity
        self.pipeline = [self.entity]
        self.pipeline = [tensorizer, self.entity]

    def begin_training(self):
        for model in self.pipeline:
            model.begin_training([])
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer

    def __call__(self, input_):
        doc = self.make_doc(input_)
@@ -147,14 +95,18 @@ class Pipeline(object):
        gold = GoldParse(doc, entities=annotations)
        return gold

    def update(self, input_, annot):
        doc = self.make_doc(input_)
        gold = self.make_gold(input_, annot)
        for ner in gold.ner:
            if ner not in (None, '-', 'O'):
                action, label = ner.split('-', 1)
                self.entity.add_label(label)
        return self.entity.update(doc, gold)
    def update(self, inputs, annots, sgd, losses=None, drop=0.):
        if losses is None:
            losses = {}
        docs = [self.make_doc(input_) for input_ in inputs]
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]

        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
                                       sgd=sgd, losses=losses)
        bp_tensors(d_tensors, sgd=sgd)
        return losses

    def evaluate(self, examples):
        scorer = Scorer()
@@ -164,34 +116,38 @@ class Pipeline(object):
            scorer.score(doc, gold)
        return scorer.scores

    def average_weights(self):
        self.entity.model.end_training()

    def save(self, path):
    def to_disk(self, path):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        save_vocab(self.vocab, path / 'vocab')
        save_ner_model(self.entity, path / 'ner')
        self.vocab.to_disk(path / 'vocab')
        self.tensorizer.to_disk(path / 'tensorizer')
        self.entity.to_disk(path / 'ner')

    def from_disk(self, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
        self.entity = self.entity.from_disk(path / 'ner')


def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
    next_epoch = train_examples
def train(nlp, train_examples, dev_examples, nr_epoch=5):
    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
        this_epoch = next_epoch
        next_epoch = []
        loss = 0
        for input_, annot in this_epoch:
            loss += nlp.update(input_, annot)
            if (i+1) < nr_epoch:
                next_epoch.append((input_, annot))
        random.shuffle(next_epoch)
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(i, loss, scores)
    nlp.average_weights()
        report_scores(i, losses['ner'], scores)
    scores = nlp.evaluate(dev_examples)
    report_scores(channels, i+1, loss, scores)
@@ -208,7 +164,8 @@ def read_examples(path):
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            if not sent.strip():
            sent = sent.strip()
            if not sent:
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
@@ -217,28 +174,39 @@ def read_examples(path):
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split()
                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)


def get_labels(examples):
    labels = set()
    for words, tags in examples:
        for tag in tags:
            if '-' in tag:
                labels.add(tag.split('-')[1])
    return sorted(labels)


@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
         train_loc=None, dev_loc=None, nr_epoch=30):

    train_examples = read_examples(train_loc)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
    print(model_dir, train_loc, dev_loc)
    train_examples = list(read_examples(train_loc))
    dev_examples = read_examples(dev_loc)
    nlp = Pipeline.load(model_dir)
    nlp = Pipeline()
    for label in get_labels(train_examples):
        nlp.entity.add_label(label)
        print("Add label", label)

    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
    train(nlp, train_examples, list(dev_examples), nr_epoch)

    nlp.save(model_dir)
    nlp.to_disk(model_dir)


if __name__ == '__main__':
    main()
    plac.call(main)
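For context: the GermEval 2014 file is read as blank-line-separated sentences of tab-separated columns (roughly index, token, outer tag, nested tag), and the IOB tags are converted to BILUO. A minimal sketch, not from this diff, using the iob_to_biluo helper imported above (the column layout is an assumption):

from spacy.gold import iob_to_biluo

sent = "1\tAachener\tB-ORGpart\tO\n2\tPrinten\tI-ORGpart\tO"   # toy two-token sentence
words, iob = [], []
for token_line in sent.split('\n'):
    pieces = token_line.split('\t')
    words.append(pieces[1])    # column 1: the token text
    iob.append(pieces[2])      # column 2: the outer NER tag
print(words)                   # ['Aachener', 'Printen']
print(iob_to_biluo(iob))       # ['B-ORGpart', 'L-ORGpart']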
@@ -25,7 +25,7 @@ For more details, see the documentation:
* Saving and loading models: https://spacy.io/docs/usage/saving-loading

Developed for: spaCy 1.7.6
Last tested for: spaCy 1.7.6
Last updated for: spaCy 2.0.0a13
"""
from __future__ import unicode_literals, print_function

@@ -34,55 +34,41 @@ from pathlib import Path
import random

import spacy
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.gold import GoldParse, minibatch
from spacy.pipeline import NeuralEntityRecognizer
from spacy.pipeline import TokenVectorEncoder


def get_gold_parses(tokenizer, train_data):
    '''Shuffle and create GoldParse objects'''
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = tokenizer(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        yield doc, gold


def train_ner(nlp, train_data, output_dir):
    # Add new words to vocab
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    random.seed(0)
    # You may need to change the learning rate. It's generally difficult to
    # guess what rate you should set, especially when you have limited data.
    nlp.entity.model.learn_rate = 0.001
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0.
        for raw_text, entity_offsets in train_data:
            gold = GoldParse(doc, entities=entity_offsets)
            # By default, the GoldParse class assumes that the entities
            # described by offset are complete, and all other words should
            # have the tag 'O'. You can tell it to make no assumptions
            # about the tag of a word by giving it the tag '-'.
            # However, this allows a trivial solution to the current
            # learning problem: if words are either 'any tag' or 'ANIMAL',
            # the model can learn that all words can be tagged 'ANIMAL'.
            #for i in range(len(gold.ner)):
            #    if not gold.ner[i].endswith('ANIMAL'):
            #        gold.ner[i] = '-'
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            # As of 1.9, spaCy's parser now lets you supply a dropout probability
            # This might help the model generalize better from only a few
            # examples.
            loss += nlp.entity.update(doc, gold, drop=0.9)
        if loss == 0:
            break
    # This step averages the model's weights. This may or may not be good for
    # your situation --- it's empirical.
    nlp.end_training()
    if output_dir:
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.save_to_directory(output_dir)
    optimizer = nlp.begin_training(lambda: [])
    nlp.meta['name'] = 'en_ent_animal'
    for itn in range(50):
        losses = {}
        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
            docs, golds = zip(*batch)
            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
                       drop=0.35)
        print(losses)
    if not output_dir:
        return
    elif not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)


def main(model_name, output_directory=None):
    print("Loading initial model", model_name)
    nlp = spacy.load(model_name)
    print("Creating initial model", model_name)
    nlp = spacy.blank(model_name)
    if output_directory is not None:
        output_directory = Path(output_directory)

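The comments above describe how GoldParse treats entity offsets: unannotated tokens default to 'O', and '-' marks a token whose tag is unknown. A minimal sketch, not from this diff, against the 2.0.0a-era API:

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc(u'Horses are too tall')
gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])
print(gold.ner)    # ['U-ANIMAL', 'O', 'O', 'O'] -- everything outside the offset is 'O'
# To leave a token unconstrained, supply per-token tags and use '-' for it,
# e.g. GoldParse(doc, entities=['U-ANIMAL', '-', '-', '-']).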
@@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
            "Horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')],
        ),
        (
            "Do they bite?",
            [],
        ),

        (
            "horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]
@@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
        )

    ]
    nlp.entity.add_label('ANIMAL')
    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
    nlp.pipeline[-1].add_label('ANIMAL')
    train_ner(nlp, train_data, output_directory)

    # Test that the entity is recognized
    doc = nlp('Do you like horses?')
    text = 'Do you like horses?'
    print("Ents in 'Do you like horses?':")
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    if output_directory:
        print("Loading from", output_directory)
        nlp2 = spacy.load('en', path=output_directory)
        nlp2.entity.add_label('ANIMAL')
        nlp2 = spacy.load(output_directory)
        doc2 = nlp2('Do you like horses?')
        for ent in doc2.ents:
            print(ent.label_, ent.text)
@@ -1,9 +1,9 @@
cython<0.24
cython>=0.24
pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.8.0,<6.9.0
thinc>=6.8.1,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six
setup.py
@@ -195,7 +195,7 @@ def setup_package():
            'murmurhash>=0.28,<0.29',
            'cymem>=1.30,<1.32',
            'preshed>=1.0.0,<2.0.0',
            'thinc>=6.8.0,<6.9.0',
            'thinc>=6.8.1,<6.9.0',
            'plac<1.0.0,>=0.9.6',
            'pip>=9.0.0,<10.0.0',
            'six',
@@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__title__ = 'spacy-nightly'
__version__ = '2.0.0a13'
__version__ = '2.0.0a14'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'
@@ -80,6 +80,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
    n_train_words = corpus.count_train()

    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
    try:
@@ -7,6 +7,7 @@ import re
import ujson
import random
import cytoolz
import itertools

from .syntax import nonproj
from .util import ensure_path
@@ -146,9 +147,13 @@ def minibatch(items, size=8):
    '''Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.
    '''
    if isinstance(size, int):
        size_ = itertools.repeat(8)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size) #if hasattr(size, '__next__') else size
        batch_size = next(size_)
        batch = list(cytoolz.take(int(batch_size), items))
        if len(batch) == 0:
            break
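As the docstring notes, `size` may be an iterator so the batch size can vary on each step. A minimal usage sketch, not from this diff (the size schedule here is arbitrary):

import itertools
from spacy.gold import minibatch

# Batch sizes 1, 2, 4 and then 8 forever; any infinite iterator of numbers works here.
sizes = itertools.chain([1, 2, 4], itertools.repeat(8))

for batch in minibatch(range(20), size=sizes):
    print(len(batch))    # 1, 2, 4, 8, 5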
@@ -14,8 +14,8 @@ class Chinese(Language):
        except ImportError:
            raise ImportError("The Chinese tokenizer requires the Jieba library: "
                              "https://github.com/fxsjy/jieba")
        words = list(jieba.cut(text, cut_all=True))
        words=[x for x in words if x]
        words = list(jieba.cut(text, cut_all=False))
        words = [x for x in words if x]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

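The change above switches Jieba from full mode to accurate mode. For context, a small sketch (not from this diff) of the difference; full mode emits overlapping candidate segments, which cannot be lined up with a Doc's non-overlapping tokens:

import jieba

text = u'我来到北京清华大学'
print(list(jieba.cut(text, cut_all=True)))    # full mode: overlapping candidate words
print(list(jieba.cut(text, cut_all=False)))   # accurate mode: one clean segmentation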
@@ -346,15 +346,9 @@ class Language(object):
        """Allocate models, pre-process training data and acquire a trainer and
        optimizer. Used as a contextmanager.

        gold_tuples (iterable): Gold-standard training data.
        get_gold_tuples (function): Function returning gold data
        **cfg: Config parameters.
        YIELDS (tuple): A trainer and an optimizer.

        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>     for epoch in trainer.epochs(gold):
            >>>         for docs, golds in epoch:
            >>>             state = nlp.update(docs, golds, sgd=optimizer)
        returns: An optimizer
        """
        if self.parser:
            self.pipeline.append(NeuralLabeller(self.vocab))
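The docstring change reflects the new API: begin_training takes a function returning gold data and returns an optimizer, rather than acting as a contextmanager. A hedged sketch, not from this diff, mirroring how the example scripts in this commit call it (a toy pipeline with no components is assumed to train cleanly):

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')                      # toy pipeline; add components before real training
optimizer = nlp.begin_training(lambda: [])   # returns an optimizer directly
doc = nlp.make_doc(u'Do you like horses?')
gold = GoldParse(doc, entities=[])
nlp.update([doc], [gold], sgd=optimizer, drop=0.35)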
@@ -38,7 +38,8 @@ class Lemmatizer(object):
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        others = [key for key in morphology
                  if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
        true_morph_key = morphology.get('morph', 0)
        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
            return True
@@ -47,7 +48,9 @@ class Lemmatizer(object):
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
                                     morphology.get('Tense') == 'pres'):
                                     morphology.get('Tense') == 'pres' and \
                                     morphology.get('Number') is None and \
                                     not others):
            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True
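In plain terms, the fix narrows the rule so a present-tense finite verb only counts as a base form (VBP) when no other morphological features are set; a VBZ like "works" carries Number=sing and so still gets lemmatized (see test_issue1305 later in this diff). A condensed pure-Python sketch, not from this diff, of the assumed semantics:

def is_base_form(univ_pos, morphology):
    others = [key for key in morphology
              if key not in ('Number', 'POS', 'VerbForm', 'Tense')]
    if univ_pos == 'noun' and morphology.get('Number') == 'sing':
        return True
    elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin'
                                 and morphology.get('Tense') == 'pres'
                                 and morphology.get('Number') is None
                                 and not others):
        return True
    elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
        return True
    return False

print(is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'}))                    # True  (VBP)
print(is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres', 'Number': 'sing'}))  # False (VBZ)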
@@ -1,4 +1,4 @@
cpdef enum symbol_t:
cdef enum symbol_t:
    NIL
    IS_ALPHA
    IS_ASCII
@@ -1,4 +1,6 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False

from __future__ import unicode_literals

IDS = {
@@ -458,4 +460,11 @@ IDS = {
    "xcomp": xcomp
}

NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
def sort_nums(x):
    return x[1]

NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)
@@ -121,6 +121,8 @@ cdef cppclass StateC:
        for i in range(n):
            if ids[i] >= 0:
                ids[i] += this.offset
            else:
                ids[i] = -1

    int S(int i) nogil const:
        if i >= this._s_i:
@@ -163,9 +165,9 @@ cdef cppclass StateC:

    int E(int i) nogil const:
        if this._e_i <= 0 or this._e_i >= this.length:
            return 0
            return -1
        if i < 0 or i >= this._e_i:
            return 0
            return -1
        return this._ents[this._e_i - (i+1)].start

    int L(int i, int idx) nogil const:
@@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
    cdef Transition lookup_transition(self, object name) except *:
        cdef attr_t label
        if name == '-' or name == None:
            move_str = 'M'
            label = 0
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
        elif '-' in name:
@@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
            raise Exception(move)
        return t

    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
        if action == OUT and label_id != 0:
            return
        if action == MISSING or action == ISNT:
            return
        # Check we're not creating a move we already have, so that this is
        # idempotent
        for trans in self.c[:self.n_moves]:
            if trans.move == action and trans.label == label_id:
                return 0
        if self.n_moves >= self._size:
            self._size *= 2
            self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
        assert self.c[self.n_moves].label == label_id
        self.n_moves += 1
        return 1


    cdef int initialize_state(self, StateC* st) nogil:
        # This is especially necessary when we use limited training data.
        for i in range(st.length):
@@ -262,8 +262,8 @@ cdef class Parser:
            upper.is_noop = True
        else:
            upper = chain(
                clone(Maxout(hidden_width), (depth-1)),
                zero_init(Affine(nr_class, drop_factor=0.0))
                clone(Maxout(hidden_width), depth-1),
                zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
            )
            upper.is_noop = False
        # TODO: This is an unfortunate hack atm!
@@ -395,7 +395,6 @@ cdef class Parser:
            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
        else:
            tokvecs = self.model[0].ops.flatten(tokvecses)

        nr_state = len(docs)
        nr_class = self.moves.n_moves
        nr_dim = tokvecs.shape[1]
@@ -421,7 +420,7 @@ cdef class Parser:
        cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
        while not next_step.empty():
            if not has_hidden:
                for i in cython.parallel.prange(
                for i in range(
                        next_step.size(), num_threads=6, nogil=True):
                    self._parse_step(next_step[i],
                        feat_weights, nr_class, nr_feat, nr_piece)
@@ -528,7 +527,6 @@ cdef class Parser:
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        docs, tokvec_lists = docs_tokvecs
        tokvecs = self.model[0].ops.flatten(tokvec_lists)
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
            docs = [docs]
            golds = [golds]
@@ -609,7 +607,7 @@ cdef class Parser:
        assert min(lengths) >= 1
        if USE_FINE_TUNE:
            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
            tokvecs += self.model[0].ops.flatten(my_tokvecs)
            tokvecs = self.model[0].ops.flatten(my_tokvecs)
        else:
            tokvecs = self.model[0].ops.flatten(tokvecs)
        states = self.moves.init_batch(docs)
@@ -647,6 +645,15 @@ cdef class Parser:
            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
        return d_tokvecs

    def _pad_tokvecs(self, tokvecs):
        # Add a vector for missing values at the start of tokvecs
        xp = get_array_module(tokvecs)
        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
        return xp.vstack((pad, tokvecs))

    def _unpad_tokvecs(self, d_tokvecs):
        return d_tokvecs[1:]

    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,
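The new _pad_tokvecs/_unpad_tokvecs helpers prepend an all-zero row as the vector for missing values and strip it again from the gradient. An illustrative numpy-only sketch, not from this diff:

import numpy as np

tokvecs = np.random.uniform(-1., 1., (5, 4)).astype('float32')   # 5 tokens, width 4
pad = np.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
padded = np.vstack((pad, tokvecs))       # row 0 acts as the missing-value vector
assert padded.shape == (6, 4)
d_padded = np.ones_like(padded)          # pretend gradient w.r.t. the padded array
d_tokvecs = d_padded[1:]                 # unpad: drop the gradient for the pad row
assert d_tokvecs.shape == tokvecs.shape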
@@ -148,7 +148,7 @@ cdef class TransitionSystem:

    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, int):
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
spacy/tests/regression/test_issue1305.py (new file)
@@ -0,0 +1,8 @@
import pytest

@pytest.mark.models('en')
def test_issue1305(EN):
    '''Test lemmatization of English VBZ'''
    assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
    doc = EN(u'This app works well')
    assert doc[2].lemma_ == 'work'
@@ -9,11 +9,14 @@ import pytest
@pytest.mark.models('en')
def test_issue429(EN):
    def merge_phrases(matcher, doc, i, matches):
        if i != len(matches) - 1:
            return None
        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
        for ent_id, label, span in spans:
            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
        if i != len(matches) - 1:
            return None
        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
        for ent_id, label, span in spans:
            span.merge(
                tag=('NNP' if label else span.root.tag_),
                lemma=span.text,
                label='PERSON')

    doc = EN('a')
    matcher = Matcher(EN.vocab)
@@ -6,6 +6,16 @@ from ...strings import StringStore
import pytest


def test_string_hash(stringstore):
    '''Test that string hashing is stable across platforms'''
    ss = stringstore
    assert ss.add('apple') == 8566208034543834098
    heart = '\U0001f499'
    print(heart)
    h = ss.add(heart)
    assert h == 11841826740069053588


def test_stringstore_from_api_docs(stringstore):
    apple_hash = stringstore.add('apple')
    assert apple_hash == 8566208034543834098
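The new test pins the 64-bit hashes so they can be checked across platforms. A quick sketch, not from this diff, reproducing them with the public StringStore API:

from spacy.strings import StringStore

ss = StringStore()
print(ss.add(u'apple'))        # 8566208034543834098, identical on every platform
print(ss.add(u'\U0001f499'))   # 11841826740069053588 (blue heart emoji)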
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

import sys
import pytest


@@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length
    # These break on narrow unicode builds, e.g. Windows
    if sys.maxunicode >= 1114111:
        tokens = tokenizer(text)
        assert len(tokens) == length
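The guard keys off sys.maxunicode: wide Unicode builds report 1114111, while narrow builds (e.g. some Windows Python 2 installs) report 65535 and store astral-plane emoji as surrogate pairs, which throws the token counts off. A tiny check, not from this diff:

import sys

# True on wide builds (all Python 3.3+); False on narrow Python 2 builds.
print(sys.maxunicode >= 1114111)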
@@ -17,6 +17,7 @@ fi

if [ "${VIA}" == "compile" ]; then
    pip install -r requirements.txt
    python setup.py build_ext --inplace
    pip install -e .
fi

@@ -282,7 +282,7 @@ p
    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(word)
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

p
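The docs snippet above fixes len(word) to len(words). For context, a self-contained sketch, not from this diff, of the whitespace tokenizer it describes and how it would be plugged in:

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp(u"What's happened to me? he thought.")
print([t.text for t in doc])   # split on spaces only: ["What's", 'happened', ...]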