mirror of https://github.com/explosion/spaCy.git

Resolve fine-tuning conflict

Commit 43210abacc
@@ -1 +1,56 @@
environment:

  matrix:

    # For Python versions available on Appveyor, see
    # http://www.appveyor.com/docs/installed-software#python
    # The list here is complete (excluding Python 2.6, which
    # isn't covered by this document) at the time of writing.

    - PYTHON: "C:\\Python27"
    #- PYTHON: "C:\\Python33"
    #- PYTHON: "C:\\Python34"
    #- PYTHON: "C:\\Python35"
    #- PYTHON: "C:\\Python27-x64"
    #- PYTHON: "C:\\Python33-x64"
    #- DISTUTILS_USE_SDK: "1"
    #- PYTHON: "C:\\Python34-x64"
    #- DISTUTILS_USE_SDK: "1"
    #- PYTHON: "C:\\Python35-x64"
    - PYTHON: "C:\\Python36-x64"

install:
  # We need wheel installed to build wheels
  - "%PYTHON%\\python.exe -m pip install wheel"
  - "%PYTHON%\\python.exe -m pip install cython"
  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
  - "%PYTHON%\\python.exe setup.py build_ext --inplace"
  - "%PYTHON%\\python.exe -m pip install -e ."

build: off

test_script:
  # Put your test command here.
  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
  # you can remove "build.cmd" from the front of the command, as it's
  # only needed to support those cases.
  # Note that you must use the environment variable %PYTHON% to refer to
  # the interpreter you're using - Appveyor does not do anything special
  # to put the Python version you want to use on PATH.
  - "%PYTHON%\\python.exe -m pytest spacy/"

after_test:
  # This step builds your wheels.
  # Again, you only need build.cmd if you're building C extensions for
  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
  # interpreter
  - "%PYTHON%\\python.exe setup.py bdist_wheel"

artifacts:
  # bdist_wheel puts your built wheel in the dist directory
  - path: dist\*

#on_success:
#  You can use this step to upload your artifacts to a public website.
#  See Appveyor's documentation for more details. Or you can simply
#  access your wheels from the Appveyor "artifacts" tab for your build.

@@ -13,24 +13,27 @@ Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

Developed for: spaCy 1.7.1
Last tested for: spaCy 1.7.1
Last tested for: spaCy 2.0.0a13
'''
from __future__ import unicode_literals, print_function
import plac
from pathlib import Path
import random
import json
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
import tqdm

import spacy.orth as orth_funcs
from spacy.vocab import Vocab
from spacy.pipeline import BeamEntityRecognizer
from spacy.pipeline import EntityRecognizer
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
from spacy.gold import _iob_to_biluo as iob_to_biluo
from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
from spacy.scorer import Scorer
import spacy.util

try:
    unicode

@@ -38,95 +41,40 @@ except NameError:
    unicode = str


spacy.util.set_env_log(True)


def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
            SHAPE: orth_funcs.word_shape,
            NORM: lambda string: string.lower(),
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
            CLUSTER: lambda string: 0,
            IS_ALPHA: orth_funcs.is_alpha,
            IS_ASCII: orth_funcs.is_ascii,
            IS_DIGIT: lambda string: string.isdigit(),
            IS_LOWER: orth_funcs.is_lower,
            IS_PUNCT: orth_funcs.is_punct,
            IS_SPACE: lambda string: string.isspace(),
            IS_TITLE: orth_funcs.is_title,
            IS_UPPER: orth_funcs.is_upper,
            IS_STOP: lambda string: False,
            IS_OOV: lambda string: True
        })


def save_vocab(vocab, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    elif not path.is_dir():
        raise IOError("Can't save vocab to %s\nNot a directory" % path)
    with (path / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    vocab.dump((path / 'lexemes.bin').as_posix())


def load_vocab(path):
    path = Path(path)
    if not path.exists():
        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
    if not path.is_dir():
        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
    return Vocab.load(path)


def init_ner_model(vocab, features=None):
    if features is None:
        features = tuple(EntityRecognizer.feature_templates)
    return EntityRecognizer(vocab, features=features)


def save_ner_model(model, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    if not path.is_dir():
        raise IOError("Can't save model to %s\nNot a directory" % path)
    model.model.dump((path / 'model').as_posix())
    with (path / 'config.json').open('w') as file_:
        data = json.dumps(model.cfg)
        if not isinstance(data, unicode):
            data = data.decode('utf8')
        file_.write(data)


def load_ner_model(vocab, path):
    return EntityRecognizer.load(path, vocab)


class Pipeline(object):
    @classmethod
    def load(cls, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        vocab = load_vocab(path)
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)

    def __init__(self, vocab=None, tokenizer=None, entity=None):
    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if tensorizer is None:
            tensorizer = TokenVectorEncoder(vocab)
        if entity is None:
            entity = init_ner_model(self.vocab)
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tensorizer = tensorizer
        self.entity = entity
        self.pipeline = [self.entity]
        self.pipeline = [tensorizer, self.entity]

    def begin_training(self):
        for model in self.pipeline:
            model.begin_training([])
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer

    def __call__(self, input_):
        doc = self.make_doc(input_)

@@ -147,14 +95,18 @@ class Pipeline(object):
        gold = GoldParse(doc, entities=annotations)
        return gold

    def update(self, input_, annot):
        doc = self.make_doc(input_)
        gold = self.make_gold(input_, annot)
        for ner in gold.ner:
            if ner not in (None, '-', 'O'):
                action, label = ner.split('-', 1)
                self.entity.add_label(label)
        return self.entity.update(doc, gold)
    def update(self, inputs, annots, sgd, losses=None, drop=0.):
        if losses is None:
            losses = {}
        docs = [self.make_doc(input_) for input_ in inputs]
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]

        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
                                      sgd=sgd, losses=losses)
        bp_tensors(d_tensors, sgd=sgd)
        return losses

    def evaluate(self, examples):
        scorer = Scorer()

@@ -164,34 +116,38 @@ class Pipeline(object):
            scorer.score(doc, gold)
        return scorer.scores

    def average_weights(self):
        self.entity.model.end_training()

    def save(self, path):
    def to_disk(self, path):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        save_vocab(self.vocab, path / 'vocab')
        save_ner_model(self.entity, path / 'ner')
        self.vocab.to_disk(path / 'vocab')
        self.tensorizer.to_disk(path / 'tensorizer')
        self.entity.to_disk(path / 'ner')

    def from_disk(self, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
        self.entity = self.entity.from_disk(path / 'ner')


def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
    next_epoch = train_examples
def train(nlp, train_examples, dev_examples, nr_epoch=5):
    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
        this_epoch = next_epoch
        next_epoch = []
        loss = 0
        for input_, annot in this_epoch:
            loss += nlp.update(input_, annot)
            if (i+1) < nr_epoch:
                next_epoch.append((input_, annot))
        random.shuffle(next_epoch)
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(i, loss, scores)
    nlp.average_weights()
        report_scores(i, losses['ner'], scores)
    scores = nlp.evaluate(dev_examples)
    report_scores(channels, i+1, loss, scores)

@@ -208,7 +164,8 @@ def read_examples(path):
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            if not sent.strip():
            sent = sent.strip()
            if not sent:
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):

@@ -217,28 +174,39 @@ def read_examples(path):
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split()
                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)


def get_labels(examples):
    labels = set()
    for words, tags in examples:
        for tag in tags:
            if '-' in tag:
                labels.add(tag.split('-')[1])
    return sorted(labels)


@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
        train_loc=None, dev_loc=None, nr_epoch=30):

    train_examples = read_examples(train_loc)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
    print(model_dir, train_loc, dev_loc)
    train_examples = list(read_examples(train_loc))
    dev_examples = read_examples(dev_loc)
    nlp = Pipeline.load(model_dir)
    nlp = Pipeline()
    for label in get_labels(train_examples):
        nlp.entity.add_label(label)
        print("Add label", label)

    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
    train(nlp, train_examples, list(dev_examples), nr_epoch)

    nlp.save(model_dir)
    nlp.to_disk(model_dir)


if __name__ == '__main__':
    main()
    plac.call(main)

@@ -25,7 +25,7 @@ For more details, see the documentation:
* Saving and loading models: https://spacy.io/docs/usage/saving-loading

Developed for: spaCy 1.7.6
Last tested for: spaCy 1.7.6
Last updated for: spaCy 2.0.0a13
"""
from __future__ import unicode_literals, print_function

@@ -34,55 +34,41 @@ from pathlib import Path
import random

import spacy
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.gold import GoldParse, minibatch
from spacy.pipeline import NeuralEntityRecognizer
from spacy.pipeline import TokenVectorEncoder


def get_gold_parses(tokenizer, train_data):
    '''Shuffle and create GoldParse objects'''
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = tokenizer(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        yield doc, gold


def train_ner(nlp, train_data, output_dir):
    # Add new words to vocab
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    random.seed(0)
    # You may need to change the learning rate. It's generally difficult to
    # guess what rate you should set, especially when you have limited data.
    nlp.entity.model.learn_rate = 0.001
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0.
        for raw_text, entity_offsets in train_data:
            gold = GoldParse(doc, entities=entity_offsets)
            # By default, the GoldParse class assumes that the entities
            # described by offset are complete, and all other words should
            # have the tag 'O'. You can tell it to make no assumptions
            # about the tag of a word by giving it the tag '-'.
            # However, this allows a trivial solution to the current
            # learning problem: if words are either 'any tag' or 'ANIMAL',
            # the model can learn that all words can be tagged 'ANIMAL'.
            #for i in range(len(gold.ner)):
                #if not gold.ner[i].endswith('ANIMAL'):
                #    gold.ner[i] = '-'
            doc = nlp.make_doc(raw_text)
            nlp.tagger(doc)
            # As of 1.9, spaCy's parser now lets you supply a dropout probability
            # This might help the model generalize better from only a few
            # examples.
            loss += nlp.entity.update(doc, gold, drop=0.9)
        if loss == 0:
            break
    # This step averages the model's weights. This may or may not be good for
    # your situation --- it's empirical.
    nlp.end_training()
    if output_dir:
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.save_to_directory(output_dir)
    optimizer = nlp.begin_training(lambda: [])
    nlp.meta['name'] = 'en_ent_animal'
    for itn in range(50):
        losses = {}
        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
            docs, golds = zip(*batch)
            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
                       drop=0.35)
        print(losses)
    if not output_dir:
        return
    elif not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)


def main(model_name, output_directory=None):
    print("Loading initial model", model_name)
    nlp = spacy.load(model_name)
    print("Creating initial model", model_name)
    nlp = spacy.blank(model_name)
    if output_directory is not None:
        output_directory = Path(output_directory)

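As an aside (not part of the commit): the comment block in the hunk above describes how GoldParse turns character-offset entities into per-token tags. A minimal sketch of that behaviour, using only calls that appear in this diff:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    doc = nlp.make_doc('Horses are too tall')

    # Character offsets (start, end, label) become per-token BILUO tags; tokens
    # outside every entity get 'O', and a tag of '-' means "no assumption".
    gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])
    print(gold.ner)  # expected roughly: ['U-ANIMAL', 'O', 'O', 'O']
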
@@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
            "Horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')],
        ),
        (
            "Do they bite?",
            [],
        ),

        (
            "horses are too tall and they pretend to care about your feelings",
            [(0, 6, 'ANIMAL')]

@@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
        )

    ]
    nlp.entity.add_label('ANIMAL')
    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
    nlp.pipeline[-1].add_label('ANIMAL')
    train_ner(nlp, train_data, output_directory)

    # Test that the entity is recognized
    doc = nlp('Do you like horses?')
    text = 'Do you like horses?'
    print("Ents in 'Do you like horses?':")
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
    if output_directory:
        print("Loading from", output_directory)
        nlp2 = spacy.load('en', path=output_directory)
        nlp2.entity.add_label('ANIMAL')
        nlp2 = spacy.load(output_directory)
        doc2 = nlp2('Do you like horses?')
        for ent in doc2.ents:
            print(ent.label_, ent.text)

@@ -1,9 +1,9 @@
cython<0.24
cython>=0.24
pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.8.0,<6.9.0
thinc>=6.8.1,<6.9.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six

setup.py
@@ -195,7 +195,7 @@ def setup_package():
                'murmurhash>=0.28,<0.29',
                'cymem>=1.30,<1.32',
                'preshed>=1.0.0,<2.0.0',
                'thinc>=6.8.0,<6.9.0',
                'thinc>=6.8.1,<6.9.0',
                'plac<1.0.0,>=0.9.6',
                'pip>=9.0.0,<10.0.0',
                'six',

@@ -3,7 +3,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__title__ = 'spacy-nightly'
__version__ = '2.0.0a13'
__version__ = '2.0.0a14'
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
__uri__ = 'https://spacy.io'
__author__ = 'Explosion AI'

@@ -80,6 +80,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
    n_train_words = corpus.count_train()

    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
    try:

@@ -7,6 +7,7 @@ import re
import ujson
import random
import cytoolz
import itertools

from .syntax import nonproj
from .util import ensure_path

@@ -146,9 +147,13 @@ def minibatch(items, size=8):
    '''Iterate over batches of items. `size` may be an iterator,
    so that batch-size can vary on each step.
    '''
    if isinstance(size, int):
        size_ = itertools.repeat(8)
    else:
        size_ = size
    items = iter(items)
    while True:
        batch_size = next(size) #if hasattr(size, '__next__') else size
        batch_size = next(size_)
        batch = list(cytoolz.take(int(batch_size), items))
        if len(batch) == 0:
            break

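Not part of the commit, just an illustration of the docstring above: `size` can be either a plain int or an iterator that yields a batch size per step. A rough sketch, assuming this is the `minibatch` helper importable as `from spacy.gold import minibatch` (the import used by the example scripts in this diff):

    import itertools

    from spacy.gold import minibatch  # the helper shown above

    items = list(range(20))

    # Fixed batch size: a plain int is wrapped in an endless iterator internally.
    for batch in minibatch(items, size=8):
        print(len(batch), batch)

    # Variable batch size: pass an iterator and each step pulls the next size.
    sizes = itertools.cycle([2, 5])
    for batch in minibatch(items, size=sizes):
        print(len(batch), batch)
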
@@ -14,8 +14,8 @@ class Chinese(Language):
        except ImportError:
            raise ImportError("The Chinese tokenizer requires the Jieba library: "
                              "https://github.com/fxsjy/jieba")
        words = list(jieba.cut(text, cut_all=True))
        words=[x for x in words if x]
        words = list(jieba.cut(text, cut_all=False))
        words = [x for x in words if x]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

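For context (an illustration, not part of the commit): jieba's full mode (`cut_all=True`) returns every dictionary word it can find, including overlapping segments, while the accurate mode (`cut_all=False`) returns a single non-overlapping segmentation, which is what a `Doc` needs. A quick comparison, assuming jieba is installed:

    # -*- coding: utf-8 -*-
    import jieba

    text = u"我来到北京清华大学"

    # Full mode: every recognised word, segments may overlap.
    print(list(jieba.cut(text, cut_all=True)))

    # Accurate mode (what the tokenizer now uses): one non-overlapping segmentation.
    print(list(jieba.cut(text, cut_all=False)))
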
@@ -346,15 +346,9 @@ class Language(object):
        """Allocate models, pre-process training data and acquire a trainer and
        optimizer. Used as a contextmanager.

        gold_tuples (iterable): Gold-standard training data.
        get_gold_tuples (function): Function returning gold data
        **cfg: Config parameters.
        YIELDS (tuple): A trainer and an optimizer.

        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>    for epoch in trainer.epochs(gold):
            >>>        for docs, golds in epoch:
            >>>            state = nlp.update(docs, golds, sgd=optimizer)
        returns: An optimizer
        """
        if self.parser:
            self.pipeline.append(NeuralLabeller(self.vocab))

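The docstring change reflects the call pattern used elsewhere in this diff (the CLI and the example scripts): `begin_training` takes a function returning gold tuples and now simply returns an optimizer, which is passed back into `nlp.update`. A condensed sketch of that pattern, built only from calls that appear in this diff:

    import spacy
    from spacy.gold import GoldParse, minibatch
    from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer

    nlp = spacy.blank('en')
    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
    nlp.pipeline[-1].add_label('ANIMAL')

    optimizer = nlp.begin_training(lambda: [])  # no longer a contextmanager

    train_data = [('Do you like horses?', [(12, 18, 'ANIMAL')])]
    for itn in range(10):
        losses = {}
        for batch in minibatch(train_data, size=3):
            docs = [nlp.make_doc(text) for text, _ in batch]
            golds = [GoldParse(doc, entities=ents)
                     for doc, (_, ents) in zip(docs, batch)]
            nlp.update(docs, golds, sgd=optimizer, drop=0.35, losses=losses)
        print(itn, losses)
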
@@ -38,7 +38,8 @@ class Lemmatizer(object):
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
        others = [key for key in morphology
                  if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
        true_morph_key = morphology.get('morph', 0)
        if univ_pos == 'noun' and morphology.get('Number') == 'sing':
            return True

@@ -47,7 +48,9 @@ class Lemmatizer(object):
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
                                     morphology.get('Tense') == 'pres'):
                                     morphology.get('Tense') == 'pres' and \
                                     morphology.get('Number') is None and \
                                     not others):
            return True
        elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
            return True

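As an illustration (not part of the commit), and assuming this check is the Lemmatizer's base-form test used by the morphology: a finite present-tense verb now only short-circuits as a base form when it has no `Number` feature and no other morphological keys, which is what makes "works" lemmatize to "work" in the new test_issue1305 below. A sketch with a single hypothetical suffix rule:

    from spacy.lemmatizer import Lemmatizer

    # Tiny hypothetical tables: one rule stripping a trailing 's' from verbs.
    lemmatizer = Lemmatizer({'verb': []}, {'verb': {}}, {'verb': [['s', '']]})

    # A finite present verb with Number=sing is no longer treated as a base form,
    # so the rule applies: 'works' -> {'work'}.
    print(lemmatizer('works', 'verb',
                     morphology={'VerbForm': 'fin', 'Tense': 'pres', 'Number': 'sing'}))

    # Without a Number feature the shortcut still holds and the form is kept.
    print(lemmatizer('work', 'verb',
                     morphology={'VerbForm': 'fin', 'Tense': 'pres'}))
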
@@ -1,4 +1,4 @@
cpdef enum symbol_t:
cdef enum symbol_t:
    NIL
    IS_ALPHA
    IS_ASCII

@@ -1,4 +1,6 @@
# coding: utf8
#cython: optimize.unpack_method_calls=False

from __future__ import unicode_literals

IDS = {

@@ -458,4 +460,11 @@ IDS = {
    "xcomp": xcomp
}

NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])]
def sort_nums(x):
    return x[1]

NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
locals().update(IDS)

@@ -121,6 +121,8 @@ cdef cppclass StateC:
        for i in range(n):
            if ids[i] >= 0:
                ids[i] += this.offset
            else:
                ids[i] = -1

    int S(int i) nogil const:
        if i >= this._s_i:

@@ -163,9 +165,9 @@ cdef cppclass StateC:

    int E(int i) nogil const:
        if this._e_i <= 0 or this._e_i >= this.length:
            return 0
            return -1
        if i < 0 or i >= this._e_i:
            return 0
            return -1
        return this._ents[this._e_i - (i+1)].start

    int L(int i, int idx) nogil const:

@@ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
    cdef Transition lookup_transition(self, object name) except *:
        cdef attr_t label
        if name == '-' or name == None:
            move_str = 'M'
            label = 0
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
        elif '-' in name:

@@ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem):
            raise Exception(move)
        return t

    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
        if action == OUT and label_id != 0:
            return
        if action == MISSING or action == ISNT:
            return
        # Check we're not creating a move we already have, so that this is
        # idempotent
        for trans in self.c[:self.n_moves]:
            if trans.move == action and trans.label == label_id:
                return 0
        if self.n_moves >= self._size:
            self._size *= 2
            self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
        self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
        assert self.c[self.n_moves].label == label_id
        self.n_moves += 1
        return 1


    cdef int initialize_state(self, StateC* st) nogil:
        # This is especially necessary when we use limited training data.
        for i in range(st.length):

@@ -262,8 +262,8 @@ cdef class Parser:
                upper.is_noop = True
            else:
                upper = chain(
                    clone(Maxout(hidden_width), (depth-1)),
                    zero_init(Affine(nr_class, drop_factor=0.0))
                    clone(Maxout(hidden_width), depth-1),
                    zero_init(Affine(nr_class, hidden_width, drop_factor=0.0))
                )
                upper.is_noop = False
        # TODO: This is an unfortunate hack atm!

@@ -395,7 +395,6 @@ cdef class Parser:
            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
        else:
            tokvecs = self.model[0].ops.flatten(tokvecses)

        nr_state = len(docs)
        nr_class = self.moves.n_moves
        nr_dim = tokvecs.shape[1]

@@ -421,7 +420,7 @@ cdef class Parser:
        cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
        while not next_step.empty():
            if not has_hidden:
                for i in cython.parallel.prange(
                for i in range(
                        next_step.size(), num_threads=6, nogil=True):
                    self._parse_step(next_step[i],
                        feat_weights, nr_class, nr_feat, nr_piece)

@@ -528,7 +527,6 @@ cdef class Parser:
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
        docs, tokvec_lists = docs_tokvecs
        tokvecs = self.model[0].ops.flatten(tokvec_lists)
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
            docs = [docs]
            golds = [golds]

@@ -609,7 +607,7 @@ cdef class Parser:
        assert min(lengths) >= 1
        if USE_FINE_TUNE:
            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
            tokvecs += self.model[0].ops.flatten(my_tokvecs)
            tokvecs = self.model[0].ops.flatten(my_tokvecs)
        else:
            tokvecs = self.model[0].ops.flatten(tokvecs)
        states = self.moves.init_batch(docs)

@@ -647,6 +645,15 @@ cdef class Parser:
            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
        return d_tokvecs

    def _pad_tokvecs(self, tokvecs):
        # Add a vector for missing values at the start of tokvecs
        xp = get_array_module(tokvecs)
        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
        return xp.vstack((pad, tokvecs))

    def _unpad_tokvecs(self, d_tokvecs):
        return d_tokvecs[1:]

    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
        doc will get multiple states. Let's say we have a doc of length 2*N,

@@ -148,7 +148,7 @@ cdef class TransitionSystem:

    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, int):
        if not isinstance(label_name, (int, long)):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name

spacy/tests/regression/test_issue1305.py (new file)
@@ -0,0 +1,8 @@
import pytest

@pytest.mark.models('en')
def test_issue1305(EN):
    '''Test lemmatization of English VBZ'''
    assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
    doc = EN(u'This app works well')
    assert doc[2].lemma_ == 'work'

@@ -9,11 +9,14 @@ import pytest
@pytest.mark.models('en')
def test_issue429(EN):
    def merge_phrases(matcher, doc, i, matches):
      if i != len(matches) - 1:
        return None
      spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
      for ent_id, label, span in spans:
        span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
        if i != len(matches) - 1:
            return None
        spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
        for ent_id, label, span in spans:
            span.merge(
                tag=('NNP' if label else span.root.tag_),
                lemma=span.text,
                label='PERSON')

    doc = EN('a')
    matcher = Matcher(EN.vocab)

@@ -6,6 +6,16 @@ from ...strings import StringStore
import pytest


def test_string_hash(stringstore):
    '''Test that string hashing is stable across platforms'''
    ss = stringstore
    assert ss.add('apple') == 8566208034543834098
    heart = '\U0001f499'
    print(heart)
    h = ss.add(heart)
    assert h == 11841826740069053588


def test_stringstore_from_api_docs(stringstore):
    apple_hash = stringstore.add('apple')
    assert apple_hash == 8566208034543834098

@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

import sys
import pytest

@@ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length


@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
def test_tokenizer_handles_emoji(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length
    # These break on narrow unicode builds, e.g. Windows
    if sys.maxunicode >= 1114111:
        tokens = tokenizer(text)
        assert len(tokens) == length

@@ -17,6 +17,7 @@ fi

if [ "${VIA}" == "compile" ]; then
  pip install -r requirements.txt
  python setup.py build_ext --inplace
  pip install -e .
fi

@@ -282,7 +282,7 @@ p
        def __call__(self, text):
            words = text.split(' ')
            # All tokens 'own' a subsequent space character in this tokenizer
            spaces = [True] * len(word)
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)

p