Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-30 23:47:31 +03:00

Commit c6cd81f192: Wrap try/except around model saving
AppVeyor CI configuration:

@@ -1 +1,55 @@
+environment:
+
+  matrix:
+
+    # For Python versions available on Appveyor, see
+    # http://www.appveyor.com/docs/installed-software#python
+    # The list here is complete (excluding Python 2.6, which
+    # isn't covered by this document) at the time of writing.
+
+    - PYTHON: "C:\\Python27"
+    #- PYTHON: "C:\\Python33"
+    #- PYTHON: "C:\\Python34"
+    #- PYTHON: "C:\\Python35"
+    #- PYTHON: "C:\\Python27-x64"
+    #- PYTHON: "C:\\Python33-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python34-x64"
+    #- DISTUTILS_USE_SDK: "1"
+    #- PYTHON: "C:\\Python35-x64"
+    - PYTHON: "C:\\Python36-x64"
+
+install:
+  # We need wheel installed to build wheels
+  - "%PYTHON%\\python.exe -m pip install wheel"
+  - "%PYTHON%\\python.exe -m pip install cython"
+  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
+  - "%PYTHON%\\python.exe -m pip install -e ."
+
 build: off
+
+test_script:
+  # Put your test command here.
+  # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4,
+  # you can remove "build.cmd" from the front of the command, as it's
+  # only needed to support those cases.
+  # Note that you must use the environment variable %PYTHON% to refer to
+  # the interpreter you're using - Appveyor does not do anything special
+  # to put the Python version you want to use on PATH.
+  - "%PYTHON%\\python.exe -m pytest spacy/"
+
+after_test:
+  # This step builds your wheels.
+  # Again, you only need build.cmd if you're building C extensions for
+  # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct
+  # interpreter
+  - "%PYTHON%\\python.exe setup.py bdist_wheel"
+
+artifacts:
+  # bdist_wheel puts your built wheel in the dist directory
+  - path: dist\*
+
+#on_success:
+#  You can use this step to upload your artifacts to a public website.
+#  See Appveyor's documentation for more details. Or you can simply
+#  access your wheels from the Appveyor "artifacts" tab for your build.
.buildkite/sdist.yml (new file, 11 lines)

@@ -0,0 +1,11 @@
+steps:
+  -
+    command: "fab env clean make test sdist"
+    label: ":dizzy: :python:"
+    artifact_paths: "dist/*.tar.gz"
+  - wait
+  - trigger: "spacy-sdist-against-models"
+    label: ":dizzy: :hammer:"
+    build:
+      env:
+        SPACY_VERSION: "{$SPACY_VERSION}"
.gitignore (vendored, 4 changes)

@@ -1,14 +1,12 @@
 # spaCy
 spacy/data/
 corpora/
-models/
+/models/
 keys/
 
 # Website
 website/www/
 website/_deploy.sh
-website/package.json
-website/announcement.jade
 website/.gitignore
 
 # Cython / C extensions
Deleted example script (Chainer-based sentiment model, marked 'WIP'):

@@ -1,322 +0,0 @@
-'''WIP --- Doesn't work well yet'''
-import plac
-import random
-import six
-
-import cProfile
-import pstats
-
-import pathlib
-import cPickle as pickle
-from itertools import izip
-
-import spacy
-
-import cytoolz
-import cupy as xp
-import cupy.cuda
-import chainer.cuda
-
-import chainer.links as L
-import chainer.functions as F
-from chainer import Chain, Variable, report
-import chainer.training
-import chainer.optimizers
-from chainer.training import extensions
-from chainer.iterators import SerialIterator
-from chainer.datasets import TupleDataset
-
-
-class SentimentAnalyser(object):
-    @classmethod
-    def load(cls, path, nlp, max_length=100):
-        raise NotImplementedError
-        #with (path / 'config.json').open() as file_:
-        #    model = model_from_json(file_.read())
-        #with (path / 'model').open('rb') as file_:
-        #    lstm_weights = pickle.load(file_)
-        #embeddings = get_embeddings(nlp.vocab)
-        #model.set_weights([embeddings] + lstm_weights)
-        #return cls(model, max_length=max_length)
-
-    def __init__(self, model, max_length=100):
-        self._model = model
-        self.max_length = max_length
-
-    def __call__(self, doc):
-        X = get_features([doc], self.max_length)
-        y = self._model.predict(X)
-        self.set_sentiment(doc, y)
-
-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        for minibatch in cytoolz.partition_all(batch_size, docs):
-            minibatch = list(minibatch)
-            sentences = []
-            for doc in minibatch:
-                sentences.extend(doc.sents)
-            Xs = get_features(sentences, self.max_length)
-            ys = self._model.predict(Xs)
-            for sent, label in zip(sentences, ys):
-                sent.doc.sentiment += label - 0.5
-            for doc in minibatch:
-                yield doc
-
-    def set_sentiment(self, doc, y):
-        doc.sentiment = float(y[0])
-        # Sentiment has a native slot for a single float.
-        # For arbitrary data storage, there's:
-        # doc.user_data['my_data'] = y
-
-
-class Classifier(Chain):
-    def __init__(self, predictor):
-        super(Classifier, self).__init__(predictor=predictor)
-
-    def __call__(self, x, t):
-        y = self.predictor(x)
-        loss = F.softmax_cross_entropy(y, t)
-        accuracy = F.accuracy(y, t)
-        report({'loss': loss, 'accuracy': accuracy}, self)
-        return loss
-
-
-class SentimentModel(Chain):
-    def __init__(self, nlp, shape, **settings):
-        Chain.__init__(self,
-            embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'],
-                set_vectors=lambda arr: set_vectors(arr, nlp.vocab)),
-            encode=_Encode(shape['nr_hidden'], shape['nr_hidden']),
-            attend=_Attend(shape['nr_hidden'], shape['nr_hidden']),
-            predict=_Predict(shape['nr_hidden'], shape['nr_class']))
-        self.to_gpu(0)
-
-    def __call__(self, sentence):
-        return self.predict(
-                  self.attend(
-                      self.encode(
-                          self.embed(sentence))))
-
-
-class _Embed(Chain):
-    def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None):
-        Chain.__init__(self,
-            embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors),
-            project=L.Linear(None, nr_out, nobias=True))
-        self.embed.W.volatile = False
-
-    def __call__(self, sentence):
-        return [self.project(self.embed(ts)) for ts in F.transpose(sentence)]
-
-
-class _Encode(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            fwd=L.LSTM(nr_in, nr_out),
-            bwd=L.LSTM(nr_in, nr_out),
-            mix=L.Bilinear(nr_out, nr_out, nr_out))
-
-    def __call__(self, sentence):
-        self.fwd.reset_state()
-        fwds = map(self.fwd, sentence)
-        self.bwd.reset_state()
-        bwds = reversed(map(self.bwd, reversed(sentence)))
-        return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)]
-
-
-class _Attend(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self)
-
-    def __call__(self, sentence):
-        sent = sum(sentence)
-        return sent
-
-
-class _Predict(Chain):
-    def __init__(self, nr_in, nr_out):
-        Chain.__init__(self,
-            l1=L.Linear(nr_in, nr_in),
-            l2=L.Linear(nr_in, nr_out))
-
-    def __call__(self, vector):
-        vector = self.l1(vector)
-        vector = F.elu(vector)
-        vector = self.l2(vector)
-        return vector
-
-
-class SentenceDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels, max_length):
-        self.max_length = max_length
-        sents, labels = self._get_labelled_sentences(
-            nlp.pipe(texts, batch_size=5000, n_threads=3),
-            labels)
-        TupleDataset.__init__(self,
-            get_features(sents, max_length),
-            labels)
-
-    def __getitem__(self, index):
-        batches = [dataset[index] for dataset in self._datasets]
-        if isinstance(index, slice):
-            length = len(batches[0])
-            returns = [tuple([batch[i] for batch in batches])
-                       for i in six.moves.range(length)]
-            return returns
-        else:
-            return tuple(batches)
-
-    def _get_labelled_sentences(self, docs, doc_labels):
-        labels = []
-        sentences = []
-        for doc, y in izip(docs, doc_labels):
-            for sent in doc.sents:
-                sentences.append(sent)
-                labels.append(y)
-        return sentences, xp.asarray(labels, dtype='i')
-
-
-class DocDataset(TupleDataset):
-    def __init__(self, nlp, texts, labels):
-        self.max_length = max_length
-        DatasetMixin.__init__(self,
-            get_features(
-                nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length),
-            labels)
-
-def read_data(data_dir, limit=0):
-    examples = []
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            examples.append((text, label))
-    random.shuffle(examples)
-    if limit >= 1:
-        examples = examples[:limit]
-    return zip(*examples) # Unzips into two lists
-
-
-def get_features(docs, max_length):
-    docs = list(docs)
-    Xs = xp.zeros((len(docs), max_length), dtype='i')
-    for i, doc in enumerate(docs):
-        j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.norm
-                j += 1
-                if j >= max_length:
-                    break
-    return Xs
-
-
-def set_vectors(vectors, vocab):
-    for lex in vocab:
-        if lex.has_vector and (lex.rank+1) < vectors.shape[0]:
-            lex.norm = lex.rank+1
-            vectors[lex.rank + 1] = lex.vector
-        else:
-            lex.norm = 0
-    return vectors
-
-
-def train(train_texts, train_labels, dev_texts, dev_labels,
-        lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-        by_sentence=True):
-    nlp = spacy.load('en', entity=False)
-    if 'nr_vector' not in lstm_shape:
-        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)
-    if 'nr_dim' not in lstm_shape:
-        lstm_shape['nr_dim'] = nlp.vocab.vectors_length
-    print("Make model")
-    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
-    print("Parsing texts...")
-    if by_sentence:
-        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
-        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
-    else:
-        train_data = DocDataset(nlp, train_texts, train_labels)
-        dev_data = DocDataset(nlp, dev_texts, dev_labels)
-    train_iter = SerialIterator(train_data, batch_size=batch_size,
-                                shuffle=True, repeat=True)
-    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
-                              shuffle=False, repeat=False)
-    optimizer = chainer.optimizers.Adam()
-    optimizer.setup(model)
-    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
-    trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result')
-
-    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
-    trainer.extend(extensions.LogReport())
-    trainer.extend(extensions.PrintReport([
-        'epoch', 'main/accuracy', 'validation/main/accuracy']))
-    trainer.extend(extensions.ProgressBar())
-
-    trainer.run()
-
-
-def evaluate(model_dir, texts, labels, max_length=100):
-    def create_pipeline(nlp):
-        '''
-        This could be a lambda, but named functions are easier to read in Python.
-        '''
-        return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
-                                                               max_length=max_length)]
-
-    nlp = spacy.load('en')
-    nlp.pipeline = create_pipeline(nlp)
-
-    correct = 0
-    i = 0
-    for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
-        correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
-        i += 1
-    return float(correct) / i
-
-
-@plac.annotations(
-    train_dir=("Location of training file or directory"),
-    dev_dir=("Location of development file or directory"),
-    model_dir=("Location of output model directory",),
-    is_runtime=("Demonstrate run-time usage", "flag", "r", bool),
-    nr_hidden=("Number of hidden units", "option", "H", int),
-    max_length=("Maximum sentence length", "option", "L", int),
-    dropout=("Dropout", "option", "d", float),
-    learn_rate=("Learn rate", "option", "e", float),
-    nb_epoch=("Number of training epochs", "option", "i", int),
-    batch_size=("Size of minibatches for training LSTM", "option", "b", int),
-    nr_examples=("Limit to N examples", "option", "n", int)
-)
-def main(model_dir, train_dir, dev_dir,
-         is_runtime=False,
-         nr_hidden=64, max_length=100, # Shape
-         dropout=0.5, learn_rate=0.001, # General NN config
-         nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params
-    model_dir = pathlib.Path(model_dir)
-    train_dir = pathlib.Path(train_dir)
-    dev_dir = pathlib.Path(dev_dir)
-    if is_runtime:
-        dev_texts, dev_labels = read_data(dev_dir)
-        acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length)
-        print(acc)
-    else:
-        print("Read data")
-        train_texts, train_labels = read_data(train_dir, limit=nr_examples)
-        dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples)
-        print("Using GPU 0")
-        #chainer.cuda.get_device(0).use()
-        train_labels = xp.asarray(train_labels, dtype='i')
-        dev_labels = xp.asarray(dev_labels, dtype='i')
-        lstm = train(train_texts, train_labels, dev_texts, dev_labels,
-                     {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2,
-                      'nr_vector': 5000},
-                      {'dropout': 0.5, 'lr': learn_rate},
-                      {},
-                      nb_epoch=nb_epoch, batch_size=batch_size)
-
-
-if __name__ == '__main__':
-    #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
-    #s = pstats.Stats("Profile.prof")
-    #s.strip_dirs().sort_stats("time").print_stats()
-    plac.call(main)
Updated example script (PhraseMatcher over a Reddit corpus):

@@ -20,71 +20,71 @@ The algorithm is O(n) at run-time for document of length n because we're only ev
 matching over the tag patterns. So no matter how many phrases we're looking for,
 our pattern set stays very small (exact size depends on the maximum length we're
 looking for, as the query language currently has no quantifiers)
+
+The example expects a .bz2 file from the Reddit corpus, and a patterns file,
+formatted in jsonl as a sequence of entries like this:
+
+{"text":"Anchorage"}
+{"text":"Angola"}
+{"text":"Ann Arbor"}
+{"text":"Annapolis"}
+{"text":"Appalachia"}
+{"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
-from ast import literal_eval
 from bz2 import BZ2File
 import time
 import math
 import codecs
 
 import plac
+import ujson
 
-from preshed.maps import PreshMap
-from preshed.counter import PreshCounter
-from spacy.strings import hash_string
-from spacy.en import English
 from spacy.matcher import PhraseMatcher
+import spacy
 
 
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
-        phrase = literal_eval('u' + line.strip())
-        if ' (' in phrase and phrase.endswith(')'):
-            phrase = phrase.split(' (', 1)[0]
-        if i >= n:
-            break
-        phrase = tokenizer(phrase)
-        if all((t.is_lower and t.prob >= -10) for t in phrase):
-            continue
+        data = ujson.loads(line.strip())
+        phrase = tokenizer(data['text'])
+        for w in phrase:
+            _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
             yield phrase
 
 
-def read_text(bz2_loc):
+def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
-        for line in file_:
-            yield line.decode('utf8')
+        for i, line in enumerate(file_):
+            data = ujson.loads(line)
+            yield data['body']
+            if i >= n:
+                break
 
 
 def get_matches(tokenizer, phrases, texts, max_length=6):
-    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
-    print("Match")
+    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
+    matcher.add('Phrase', None, *phrases)
     for text in texts:
         doc = tokenizer(text)
+        for w in doc:
+            _ = doc.vocab[w.text]
         matches = matcher(doc)
-        for mwe in doc.ents:
-            yield mwe
+        for ent_id, start, end in matches:
+            yield (ent_id, doc[start:end].text)
 
 
-def main(patterns_loc, text_loc, counts_loc, n=10000000):
-    nlp = English(parser=False, tagger=False, entity=False)
-    print("Make matcher")
-    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
-    counts = PreshCounter()
+def main(patterns_loc, text_loc, n=10000):
+    nlp = spacy.blank('en')
+    nlp.vocab.lex_attr_getters = {}
+    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
+    count = 0
     t1 = time.time()
-    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
-        counts.inc(hash_string(mwe.text), 1)
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
+        count += 1
     t2 = time.time()
-    print("10m tokens in %d s" % (t2 - t1))
-
-    with codecs.open(counts_loc, 'w', 'utf8') as file_:
-        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
-            text = phrase.string
-            key = hash_string(text)
-            count = counts[key]
-            if count != 0:
-                file_.write('%d\t%s\n' % (count, text))
+    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
 
 
 if __name__ == '__main__':
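The docstring and the rewritten get_matches above describe the spaCy 2 PhraseMatcher flow: patterns are pre-tokenized Docs registered under a single rule key, and matches come back as (match_id, start, end) token offsets. The following is a minimal sketch of that flow, assuming the spaCy 2.0 alpha API this commit targets; the inline two-entry pattern list and the sample sentence are made up for illustration and stand in for the jsonl gazetteer file.

import ujson
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')

# One jsonl entry per pattern, e.g. {"text": "Ann Arbor"} (hypothetical inline data)
lines = ['{"text": "Ann Arbor"}', '{"text": "Angola"}']
patterns = [nlp.tokenizer(ujson.loads(line)['text']) for line in lines]

matcher = PhraseMatcher(nlp.vocab, max_length=6)
matcher.add('Phrase', None, *patterns)        # one rule key covering all patterns

doc = nlp.tokenizer('She moved from Angola to Ann Arbor last year.')
for match_id, start, end in matcher(doc):     # matches are token-offset spans
    print(nlp.vocab.strings[match_id], doc[start:end].text)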
Updated example script (GermEval 2014 German NER training):

@@ -13,24 +13,29 @@ Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
 
 Developed for: spaCy 1.7.1
-Last tested for: spaCy 1.7.1
+Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
+import tqdm
+
+from thinc.neural.optimizers import Adam
+from thinc.neural.ops import NumpyOps
 
-import spacy.orth as orth_funcs
 from spacy.vocab import Vocab
-from spacy.pipeline import BeamEntityRecognizer
-from spacy.pipeline import EntityRecognizer
+from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
-from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.gold import iob_to_biluo
+from spacy.gold import minibatch
 from spacy.scorer import Scorer
+import spacy.util
+
 
 try:
     unicode
@@ -38,96 +43,38 @@ except NameError:
     unicode = str
 
 
+spacy.util.set_env_log(True)
+
+
 def init_vocab():
     return Vocab(
         lex_attr_getters={
             LOWER: lambda string: string.lower(),
-            SHAPE: orth_funcs.word_shape,
+            NORM: lambda string: string.lower(),
             PREFIX: lambda string: string[0],
             SUFFIX: lambda string: string[-3:],
-            CLUSTER: lambda string: 0,
-            IS_ALPHA: orth_funcs.is_alpha,
-            IS_ASCII: orth_funcs.is_ascii,
-            IS_DIGIT: lambda string: string.isdigit(),
-            IS_LOWER: orth_funcs.is_lower,
-            IS_PUNCT: orth_funcs.is_punct,
-            IS_SPACE: lambda string: string.isspace(),
-            IS_TITLE: orth_funcs.is_title,
-            IS_UPPER: orth_funcs.is_upper,
-            IS_STOP: lambda string: False,
-            IS_OOV: lambda string: True
         })
 
 
-def save_vocab(vocab, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    elif not path.is_dir():
-        raise IOError("Can't save vocab to %s\nNot a directory" % path)
-    with (path / 'strings.json').open('w') as file_:
-        vocab.strings.dump(file_)
-    vocab.dump((path / 'lexemes.bin').as_posix())
-
-
-def load_vocab(path):
-    path = Path(path)
-    if not path.exists():
-        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
-    if not path.is_dir():
-        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
-    return Vocab.load(path)
-
-
-def init_ner_model(vocab, features=None):
-    if features is None:
-        features = tuple(EntityRecognizer.feature_templates)
-    return EntityRecognizer(vocab, features=features)
-
-
-def save_ner_model(model, path):
-    path = Path(path)
-    if not path.exists():
-        path.mkdir()
-    if not path.is_dir():
-        raise IOError("Can't save model to %s\nNot a directory" % path)
-    model.model.dump((path / 'model').as_posix())
-    with (path / 'config.json').open('w') as file_:
-        data = json.dumps(model.cfg)
-        if not isinstance(data, unicode):
-            data = data.decode('utf8')
-        file_.write(data)
-
-
-def load_ner_model(vocab, path):
-    return EntityRecognizer.load(path, vocab)
-
-
 class Pipeline(object):
-    @classmethod
-    def load(cls, path):
-        path = Path(path)
-        if not path.exists():
-            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
-        if not path.is_dir():
-            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
-        vocab = load_vocab(path)
-        tokenizer = Tokenizer(vocab, {}, None, None, None)
-        ner_model = load_ner_model(vocab, path / 'ner')
-        return cls(vocab, tokenizer, ner_model)
-
     def __init__(self, vocab=None, tokenizer=None, entity=None):
         if vocab is None:
             vocab = init_vocab()
         if tokenizer is None:
             tokenizer = Tokenizer(vocab, {}, None, None, None)
         if entity is None:
-            entity = init_ner_model(self.vocab)
+            entity = NeuralEntityRecognizer(vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
         self.entity = entity
         self.pipeline = [self.entity]
 
+    def begin_training(self):
+        for model in self.pipeline:
+            model.begin_training([])
+        optimizer = Adam(NumpyOps(), 0.001)
+        return optimizer
+
     def __call__(self, input_):
         doc = self.make_doc(input_)
         for process in self.pipeline:
@@ -147,14 +94,16 @@ class Pipeline(object):
         gold = GoldParse(doc, entities=annotations)
         return gold
 
-    def update(self, input_, annot):
-        doc = self.make_doc(input_)
-        gold = self.make_gold(input_, annot)
-        for ner in gold.ner:
-            if ner not in (None, '-', 'O'):
-                action, label = ner.split('-', 1)
-                self.entity.add_label(label)
-        return self.entity.update(doc, gold)
+    def update(self, inputs, annots, sgd, losses=None, drop=0.):
+        if losses is None:
+            losses = {}
+        docs = [self.make_doc(input_) for input_ in inputs]
+        golds = [self.make_gold(input_, annot) for input_, annot in
+                 zip(inputs, annots)]
+
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
+        return losses
 
     def evaluate(self, examples):
         scorer = Scorer()
@@ -164,34 +113,36 @@ class Pipeline(object):
             scorer.score(doc, gold)
         return scorer.scores
 
-    def average_weights(self):
-        self.entity.model.end_training()
-
-    def save(self, path):
+    def to_disk(self, path):
         path = Path(path)
         if not path.exists():
             path.mkdir()
         elif not path.is_dir():
             raise IOError("Can't save pipeline to %s\nNot a directory" % path)
-        save_vocab(self.vocab, path / 'vocab')
-        save_ner_model(self.entity, path / 'ner')
+        self.vocab.to_disk(path / 'vocab')
+        self.entity.to_disk(path / 'ner')
+
+    def from_disk(self, path):
+        path = Path(path)
+        if not path.exists():
+            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
+        if not path.is_dir():
+            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
+        self.vocab = self.vocab.from_disk(path / 'vocab')
+        self.entity = self.entity.from_disk(path / 'ner')
 
 
-def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
-    next_epoch = train_examples
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
+    sgd = nlp.begin_training()
     print("Iter", "Loss", "P", "R", "F")
     for i in range(nr_epoch):
-        this_epoch = next_epoch
-        next_epoch = []
-        loss = 0
-        for input_, annot in this_epoch:
-            loss += nlp.update(input_, annot)
-            if (i+1) < nr_epoch:
-                next_epoch.append((input_, annot))
-        random.shuffle(next_epoch)
+        random.shuffle(train_examples)
+        losses = {}
+        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
+            inputs, annots = zip(*batch)
+            nlp.update(list(inputs), list(annots), sgd, losses=losses)
         scores = nlp.evaluate(dev_examples)
-        report_scores(i, loss, scores)
-    nlp.average_weights()
+        report_scores(i, losses['ner'], scores)
     scores = nlp.evaluate(dev_examples)
     report_scores(channels, i+1, loss, scores)
 
@@ -208,7 +159,8 @@ def read_examples(path):
     with path.open() as file_:
         sents = file_.read().strip().split('\n\n')
         for sent in sents:
-            if not sent.strip():
+            sent = sent.strip()
+            if not sent:
                 continue
             tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
@@ -217,28 +169,39 @@ def read_examples(path):
             iob = []
             for token in tokens:
                 if token.strip():
-                    pieces = token.split()
+                    pieces = token.split('\t')
                     words.append(pieces[1])
                     iob.append(pieces[2])
             yield words, iob_to_biluo(iob)
 
 
+def get_labels(examples):
+    labels = set()
+    for words, tags in examples:
+        for tag in tags:
+            if '-' in tag:
+                labels.add(tag.split('-')[1])
+    return sorted(labels)
+
+
 @plac.annotations(
     model_dir=("Path to save the model", "positional", None, Path),
     train_loc=("Path to your training data", "positional", None, Path),
     dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
-        train_loc=None, dev_loc=None, nr_epoch=30):
-
-    train_examples = read_examples(train_loc)
+def main(model_dir, train_loc, dev_loc, nr_epoch=30):
+    print(model_dir, train_loc, dev_loc)
+    train_examples = list(read_examples(train_loc))
     dev_examples = read_examples(dev_loc)
-    nlp = Pipeline.load(model_dir)
+    nlp = Pipeline()
+    for label in get_labels(train_examples):
+        nlp.entity.add_label(label)
+        print("Add label", label)
 
-    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
+    train(nlp, train_examples, list(dev_examples), nr_epoch)
 
-    nlp.save(model_dir)
+    nlp.to_disk(model_dir)
 
 
 if __name__ == '__main__':
-    main()
+    plac.call(main)
Updated example script (training an additional 'ANIMAL' entity type):

@@ -25,7 +25,7 @@ For more details, see the documentation:
 * Saving and loading models: https://spacy.io/docs/usage/saving-loading
 
 Developed for: spaCy 1.7.6
-Last tested for: spaCy 1.7.6
+Last updated for: spaCy 2.0.0a13
 """
 from __future__ import unicode_literals, print_function
 
@@ -34,55 +34,41 @@ from pathlib import Path
 import random
 
 import spacy
-from spacy.gold import GoldParse
-from spacy.tagger import Tagger
+from spacy.gold import GoldParse, minibatch
+from spacy.pipeline import NeuralEntityRecognizer
+from spacy.pipeline import TokenVectorEncoder
+
+
+def get_gold_parses(tokenizer, train_data):
+    '''Shuffle and create GoldParse objects'''
+    random.shuffle(train_data)
+    for raw_text, entity_offsets in train_data:
+        doc = tokenizer(raw_text)
+        gold = GoldParse(doc, entities=entity_offsets)
+        yield doc, gold
 
 
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab
-    for raw_text, _ in train_data:
-        doc = nlp.make_doc(raw_text)
-        for word in doc:
-            _ = nlp.vocab[word.orth]
     random.seed(0)
-    # You may need to change the learning rate. It's generally difficult to
-    # guess what rate you should set, especially when you have limited data.
-    nlp.entity.model.learn_rate = 0.001
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0.
-        for raw_text, entity_offsets in train_data:
-            gold = GoldParse(doc, entities=entity_offsets)
-            # By default, the GoldParse class assumes that the entities
-            # described by offset are complete, and all other words should
-            # have the tag 'O'. You can tell it to make no assumptions
-            # about the tag of a word by giving it the tag '-'.
-            # However, this allows a trivial solution to the current
-            # learning problem: if words are either 'any tag' or 'ANIMAL',
-            # the model can learn that all words can be tagged 'ANIMAL'.
-            #for i in range(len(gold.ner)):
-                #if not gold.ner[i].endswith('ANIMAL'):
-                #    gold.ner[i] = '-'
-            doc = nlp.make_doc(raw_text)
-            nlp.tagger(doc)
-            # As of 1.9, spaCy's parser now lets you supply a dropout probability
-            # This might help the model generalize better from only a few
-            # examples.
-            loss += nlp.entity.update(doc, gold, drop=0.9)
-        if loss == 0:
-            break
-    # This step averages the model's weights. This may or may not be good for
-    # your situation --- it's empirical.
-    nlp.end_training()
-    if output_dir:
-        if not output_dir.exists():
-            output_dir.mkdir()
-        nlp.save_to_directory(output_dir)
+    optimizer = nlp.begin_training(lambda: [])
+    nlp.meta['name'] = 'en_ent_animal'
+    for itn in range(50):
+        losses = {}
+        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
+            docs, golds = zip(*batch)
+            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
+                       drop=0.35)
+        print(losses)
+    if not output_dir:
+        return
+    elif not output_dir.exists():
+        output_dir.mkdir()
+    nlp.to_disk(output_dir)
 
 
 def main(model_name, output_directory=None):
-    print("Loading initial model", model_name)
-    nlp = spacy.load(model_name)
+    print("Creating initial model", model_name)
+    nlp = spacy.blank(model_name)
     if output_directory is not None:
         output_directory = Path(output_directory)
 
@@ -91,6 +77,11 @@ def main(model_name, output_directory=None):
             "Horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')],
         ),
+        (
+            "Do they bite?",
+            [],
+        ),
+
         (
             "horses are too tall and they pretend to care about your feelings",
             [(0, 6, 'ANIMAL')]
@@ -109,18 +100,20 @@ def main(model_name, output_directory=None):
         )
 
     ]
-    nlp.entity.add_label('ANIMAL')
+    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
+    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
+    nlp.pipeline[-1].add_label('ANIMAL')
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
-    doc = nlp('Do you like horses?')
+    text = 'Do you like horses?'
     print("Ents in 'Do you like horses?':")
+    doc = nlp(text)
     for ent in doc.ents:
         print(ent.label_, ent.text)
     if output_directory:
         print("Loading from", output_directory)
-        nlp2 = spacy.load('en', path=output_directory)
-        nlp2.entity.add_label('ANIMAL')
+        nlp2 = spacy.load(output_directory)
         doc2 = nlp2('Do you like horses?')
         for ent in doc2.ents:
             print(ent.label_, ent.text)
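The comments removed above describe how GoldParse handles entity annotations: character offsets are converted to per-token BILUO tags, unannotated words default to 'O', and a '-' tag tells the model to make no assumption about a word. A minimal sketch of that conversion, assuming the spaCy 2.0 alpha API used by this example; the sentence and the 'ANIMAL' label are taken from its toy training data.

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc('Horses are too tall')
gold = GoldParse(doc, entities=[(0, 6, 'ANIMAL')])
# The character span (0, 6) covers the single token 'Horses';
# words outside the annotated span default to 'O'.
print(gold.ner)   # something like ['U-ANIMAL', 'O', 'O', 'O']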
Updated example script (TextCategorizer training on IMDB):

@@ -1,3 +1,7 @@
+'''Train a multi-label convolutional neural network text classifier,
+using the spacy.pipeline.TextCategorizer component. The model is then added
+to spacy.pipeline, and predictions are available at `doc.cats`.
+'''
 from __future__ import unicode_literals
 import plac
 import random
@@ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer
 
+# TODO: Remove this once we're not supporting models trained with thinc <6.9.0
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(False)
+
+
 
 def train_textcat(tokenizer, textcat,
                   train_texts, train_cats, dev_texts, dev_cats,
@@ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat,
     train_docs = [tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
-    train_data = zip(train_docs, train_gold)
+    train_data = list(zip(train_docs, train_gold))
     batch_sizes = compounding(4., 128., 1.001)
     for i in range(n_iter):
         losses = {}
-        train_data = tqdm.tqdm(train_data, leave=False) # Progress bar
-        for batch in minibatch(train_data, size=batch_sizes):
+        # Progress bar and minibatching
+        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
+        for batch in batches:
             docs, golds = zip(*batch)
-            textcat.update((docs, None), golds, sgd=optimizer, drop=0.2,
+            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
                 losses=losses)
         with textcat.model.use_params(optimizer.averages):
             scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
@@ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats):
     return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
 
 
-def load_data():
+def load_data(limit=0):
     # Partition off part of the train data --- avoid running experiments
     # against test.
     train_data, _ = thinc.extra.datasets.imdb()
 
     random.shuffle(train_data)
+    train_data = train_data[-limit:]
 
     texts, labels = zip(*train_data)
     cats = [(['POSITIVE'] if y else []) for y in labels]
@@ -86,7 +97,7 @@ def main(model_loc=None):
     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
 
     print("Load IMDB data")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000)
 
     print("Itn.\tLoss\tP\tR\tF")
     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
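The batching change above pairs minibatch with compounding(4., 128., 1.001), so batch sizes start small and grow geometrically towards 128 as training proceeds. A small sketch of that behaviour, assuming the spacy.util.compounding and spacy.gold.minibatch helpers the example imports; the toy training list below is made up.

from spacy.gold import minibatch
from spacy.util import compounding

train_data = [('text %d' % i, {'POSITIVE': bool(i % 2)}) for i in range(20)]

batch_sizes = compounding(4., 128., 1.001)   # infinite series: 4.0, 4.004, 4.008, ...
for batch in minibatch(train_data, size=batch_sizes):
    # minibatch draws the next size from the generator for every batch,
    # so early batches are small and later ones gradually larger.
    print(len(batch))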
							
								
								
									
examples/vectors_fast_text.py (new file, 30 lines)
@@ -0,0 +1,30 @@
+'''Load vectors for a language trained using FastText
+
+https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
+'''
+from __future__ import unicode_literals
+import plac
+import numpy
+
+import spacy.language
+
+
+def main(vectors_loc):
+    nlp = spacy.language.Language()
+
+    with open(vectors_loc, 'rb') as file_:
+        header = file_.readline()
+        nr_row, nr_dim = header.split()
+        nlp.vocab.clear_vectors(int(nr_dim))
+        for line in file_:
+            line = line.decode('utf8')
+            pieces = line.split()
+            word = pieces[0]
+            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
+            nlp.vocab.set_vector(word, vector)
+    doc = nlp(u'class colspan')
+    print(doc[0].similarity(doc[1]))
+
+
+if __name__ == '__main__':
+    plac.call(main)
							
								
								
									
fabfile.py (vendored, 5 changes)
@@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
     if path.exists(VENV_DIR):
         local('rm -rf {env}'.format(env=VENV_DIR))
+    local('pip install virtualenv')
     local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))
 
 
@@ -32,6 +33,10 @@ def make():
             local('pip install -r requirements.txt')
             local('python setup.py build_ext --inplace')
 
+def sdist():
+    with virtualenv(VENV_DIR):
+        with lcd(path.dirname(__file__)):
+            local('python setup.py sdist')
 
 def clean():
     with lcd(path.dirname(__file__)):
Requirements:

@@ -1,9 +1,9 @@
-cython<0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.0,<6.9.0
+thinc>=6.9.0,<6.10.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
@@ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0
 regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
-pip>=9.0.0,<10.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy
+html5lib==1.0b8
setup.py (3 lines changed)
|  | @ -195,9 +195,8 @@ def setup_package(): | ||||||
|                 'murmurhash>=0.28,<0.29', |                 'murmurhash>=0.28,<0.29', | ||||||
|                 'cymem>=1.30,<1.32', |                 'cymem>=1.30,<1.32', | ||||||
|                 'preshed>=1.0.0,<2.0.0', |                 'preshed>=1.0.0,<2.0.0', | ||||||
|                 'thinc>=6.8.0,<6.9.0', |                 'thinc>=6.9.0,<6.10.0', | ||||||
|                 'plac<1.0.0,>=0.9.6', |                 'plac<1.0.0,>=0.9.6', | ||||||
|                 'pip>=9.0.0,<10.0.0', |  | ||||||
|                 'six', |                 'six', | ||||||
|                 'pathlib', |                 'pathlib', | ||||||
|                 'ujson>=1.35', |                 'ujson>=1.35', | ||||||
|  |  | ||||||
|  | @ -4,11 +4,13 @@ from __future__ import unicode_literals | ||||||
| from .cli.info import info as cli_info | from .cli.info import info as cli_info | ||||||
| from .glossary import explain | from .glossary import explain | ||||||
| from .deprecated import resolve_load_name | from .deprecated import resolve_load_name | ||||||
|  | #from .about import __version__ | ||||||
| from .about import __version__ | from .about import __version__ | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load(name, **overrides): | def load(name, **overrides): | ||||||
|  |     from .deprecated import resolve_load_name | ||||||
|     name = resolve_load_name(name, **overrides) |     name = resolve_load_name(name, **overrides) | ||||||
|     return util.load_model(name, **overrides) |     return util.load_model(name, **overrides) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ if __name__ == '__main__': | ||||||
|     import plac |     import plac | ||||||
|     import sys |     import sys | ||||||
|     from spacy.cli import download, link, info, package, train, convert, model |     from spacy.cli import download, link, info, package, train, convert, model | ||||||
|     from spacy.cli import profile |     from spacy.cli import profile, evaluate | ||||||
|     from spacy.util import prints |     from spacy.util import prints | ||||||
| 
 | 
 | ||||||
|     commands = { |     commands = { | ||||||
|  | @ -15,6 +15,7 @@ if __name__ == '__main__': | ||||||
|         'link': link, |         'link': link, | ||||||
|         'info': info, |         'info': info, | ||||||
|         'train': train, |         'train': train, | ||||||
|  |         'evaluate': evaluate, | ||||||
|         'convert': convert, |         'convert': convert, | ||||||
|         'package': package, |         'package': package, | ||||||
|         'model': model, |         'model': model, | ||||||
|  |  | ||||||
spacy/_ml.py (161 lines changed)
|  | @ -1,28 +1,27 @@ | ||||||
| import ujson | import ujson | ||||||
|  | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
|  | from thinc.i2v import HashEmbed, StaticVectors | ||||||
|  | from thinc.t2t import ExtractWindow, ParametricAttention | ||||||
|  | from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||||
|  | from thinc.misc import Residual | ||||||
|  | from thinc.misc import BatchNorm as BN | ||||||
|  | from thinc.misc import LayerNorm as LN | ||||||
|  | 
 | ||||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||||
| from thinc.neural import Model, Maxout, Softmax, Affine | from thinc.api import FeatureExtracter, with_getitem | ||||||
| from thinc.neural._classes.hash_embed import HashEmbed | from thinc.api import uniqued, wrap, flatten_add_lengths, noop | ||||||
|  | 
 | ||||||
|  | from thinc.linear.linear import LinearModel | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.ops import NumpyOps, CupyOps | ||||||
| from thinc.neural.util import get_array_module | from thinc.neural.util import get_array_module | ||||||
|  | 
 | ||||||
| import random | import random | ||||||
| import cytoolz | import cytoolz | ||||||
| 
 | 
 | ||||||
| from thinc.neural._classes.convolution import ExtractWindow |  | ||||||
| from thinc.neural._classes.static_vectors import StaticVectors |  | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN |  | ||||||
| from thinc.neural._classes.layernorm import LayerNorm as LN |  | ||||||
| from thinc.neural._classes.resnet import Residual |  | ||||||
| from thinc.neural import ReLu |  | ||||||
| from thinc.neural._classes.selu import SELU |  | ||||||
| from thinc import describe | from thinc import describe | ||||||
| from thinc.describe import Dimension, Synapses, Biases, Gradient | from thinc.describe import Dimension, Synapses, Biases, Gradient | ||||||
| from thinc.neural._classes.affine import _set_dimensions_if_needed | from thinc.neural._classes.affine import _set_dimensions_if_needed | ||||||
| from thinc.api import FeatureExtracter, with_getitem | import thinc.extra.load_nlp | ||||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool |  | ||||||
| from thinc.neural._classes.attention import ParametricAttention |  | ||||||
| from thinc.linear.linear import LinearModel |  | ||||||
| from thinc.api import uniqued, wrap, flatten_add_lengths |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER | from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER | ||||||
| from .tokens.doc import Doc | from .tokens.doc import Doc | ||||||
|  | @ -31,6 +30,11 @@ from . import util | ||||||
| import numpy | import numpy | ||||||
| import io | import io | ||||||
| 
 | 
 | ||||||
|  | # TODO: Unset this once we don't want to support previous models. | ||||||
|  | import thinc.neural._classes.layernorm | ||||||
|  | thinc.neural._classes.layernorm.set_compat_six_eight(True) | ||||||
|  | 
 | ||||||
|  | VECTORS_KEY = 'spacy_pretrained_vectors' | ||||||
| 
 | 
 | ||||||
| @layerize | @layerize | ||||||
| def _flatten_add_lengths(seqs, pad=0, drop=0.): | def _flatten_add_lengths(seqs, pad=0, drop=0.): | ||||||
|  | @ -225,33 +229,80 @@ def drop_layer(layer, factor=2.): | ||||||
|     model.predict = layer |     model.predict = layer | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
|  | def link_vectors_to_models(vocab): | ||||||
|  |     vectors = vocab.vectors | ||||||
|  |     ops = Model.ops | ||||||
|  |     for word in vocab: | ||||||
|  |         if word.orth in vectors.key2row: | ||||||
|  |             word.rank = vectors.key2row[word.orth] | ||||||
|  |         else: | ||||||
|  |             word.rank = 0 | ||||||
|  |     data = ops.asarray(vectors.data) | ||||||
|  |     # Set an entry here, so that vectors are accessed by StaticVectors | ||||||
|  |     # (unideal, I know) | ||||||
|  |     thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data | ||||||
| 
 | 
 | ||||||
| def Tok2Vec(width, embed_size, preprocess=None): | def Tok2Vec(width, embed_size, **kwargs): | ||||||
|  |     pretrained_dims = kwargs.get('pretrained_dims', 0) | ||||||
|  |     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) | ||||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] |     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): |     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, | ||||||
|  |                                  '*': reapply}): | ||||||
|         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') |         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') | ||||||
|         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') |         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') | ||||||
|         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') |         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') | ||||||
|         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') |         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') | ||||||
|  |         if pretrained_dims is not None and pretrained_dims >= 1: | ||||||
|  |             glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) | ||||||
| 
 | 
 | ||||||
|         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) |             embed = uniqued( | ||||||
|         tok2vec = ( |                 (glove | norm | prefix | suffix | shape) | ||||||
|             with_flatten( |                 >> LN(Maxout(width, width*5, pieces=3)), column=5) | ||||||
|                 asarray(Model.ops, dtype='uint64') |         else: | ||||||
|                 >> uniqued(embed, column=5) |             embed = uniqued( | ||||||
|                 >> Residual( |                 (norm | prefix | suffix | shape) | ||||||
|                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) |                 >> LN(Maxout(width, width*4, pieces=3)), column=5) | ||||||
|                 ) ** 4, pad=4 | 
 | ||||||
|             ) | 
 | ||||||
|  |         convolution = Residual( | ||||||
|  |             ExtractWindow(nW=1) | ||||||
|  |             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) | ||||||
|         ) |         ) | ||||||
|         if preprocess not in (False, None): | 
 | ||||||
|             tok2vec = preprocess >> tok2vec |         tok2vec = ( | ||||||
|  |             FeatureExtracter(cols) | ||||||
|  |             >> with_flatten( | ||||||
|  |                 embed >> (convolution ** 4), pad=4) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|         # Work around thinc API limitations :(. TODO: Revise in Thinc 7 |         # Work around thinc API limitations :(. TODO: Revise in Thinc 7 | ||||||
|         tok2vec.nO = width |         tok2vec.nO = width | ||||||
|         tok2vec.embed = embed |         tok2vec.embed = embed | ||||||
|     return tok2vec |     return tok2vec | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def reapply(layer, n_times): | ||||||
|  |     def reapply_fwd(X, drop=0.): | ||||||
|  |         backprops = [] | ||||||
|  |         for i in range(n_times): | ||||||
|  |             Y, backprop = layer.begin_update(X, drop=drop) | ||||||
|  |             X = Y | ||||||
|  |             backprops.append(backprop) | ||||||
|  |         def reapply_bwd(dY, sgd=None): | ||||||
|  |             dX = None | ||||||
|  |             for backprop in reversed(backprops): | ||||||
|  |                 dY = backprop(dY, sgd=sgd) | ||||||
|  |                 if dX is None: | ||||||
|  |                     dX = dY | ||||||
|  |                 else: | ||||||
|  |                     dX += dY | ||||||
|  |             return dX | ||||||
|  |         return Y, reapply_bwd | ||||||
|  |     return wrap(reapply_fwd, layer) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def asarray(ops, dtype): | def asarray(ops, dtype): | ||||||
|     def forward(X, drop=0.): |     def forward(X, drop=0.): | ||||||
|         return ops.asarray(X, dtype=dtype), None |         return ops.asarray(X, dtype=dtype), None | ||||||
|  | @ -455,20 +506,25 @@ def getitem(i): | ||||||
|         return X[i], None |         return X[i], None | ||||||
|     return layerize(getitem_fwd) |     return layerize(getitem_fwd) | ||||||
| 
 | 
 | ||||||
| def build_tagger_model(nr_class, token_vector_width, **cfg): | def build_tagger_model(nr_class, **cfg): | ||||||
|     embed_size = util.env_opt('embed_size', 7500) |     embed_size = util.env_opt('embed_size', 7000) | ||||||
|  |     if 'token_vector_width' in cfg: | ||||||
|  |         token_vector_width = cfg['token_vector_width'] | ||||||
|  |     else: | ||||||
|  |         token_vector_width = util.env_opt('token_vector_width', 128) | ||||||
|  |     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||||
|     with Model.define_operators({'>>': chain, '+': add}): |     with Model.define_operators({'>>': chain, '+': add}): | ||||||
|         # Input: (doc, tensor) tuples |         if 'tok2vec' in cfg: | ||||||
|         private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) |             tok2vec = cfg['tok2vec'] | ||||||
| 
 |         else: | ||||||
|  |             tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|  |                               pretrained_dims=pretrained_dims) | ||||||
|         model = ( |         model = ( | ||||||
|             fine_tune(private_tok2vec) |             tok2vec | ||||||
|             >> with_flatten( |             >> with_flatten(Softmax(nr_class, token_vector_width)) | ||||||
|                 Maxout(token_vector_width, token_vector_width) |  | ||||||
|                 >> Softmax(nr_class, token_vector_width) |  | ||||||
|             ) |  | ||||||
|         ) |         ) | ||||||
|     model.nI = None |     model.nI = None | ||||||
|  |     model.tok2vec = tok2vec | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0): | ||||||
| 
 | 
 | ||||||
| def build_text_classifier(nr_class, width=64, **cfg): | def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|     nr_vector = cfg.get('nr_vector', 5000) |     nr_vector = cfg.get('nr_vector', 5000) | ||||||
|  |     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||||
|     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, |     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, | ||||||
|                                  '**': clone}): |                                  '**': clone}): | ||||||
|         if cfg.get('low_data'): |         if cfg.get('low_data'): | ||||||
|  | @ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|                 SpacyVectors |                 SpacyVectors | ||||||
|                 >> flatten_add_lengths |                 >> flatten_add_lengths | ||||||
|                 >> with_getitem(0, |                 >> with_getitem(0, | ||||||
|                     Affine(width, 300) |                     Affine(width, pretrained_dims) | ||||||
|                 ) |                 ) | ||||||
|                 >> ParametricAttention(width) |                 >> ParametricAttention(width) | ||||||
|                 >> Pooling(sum_pool) |                 >> Pooling(sum_pool) | ||||||
|  | @ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|             ) |             ) | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         static_vectors = ( |         if pretrained_dims: | ||||||
|             SpacyVectors |             static_vectors = ( | ||||||
|             >> with_flatten(Affine(width, 300)) |                 SpacyVectors | ||||||
|         ) |                 >> with_flatten(Affine(width, pretrained_dims)) | ||||||
| 
 |             ) | ||||||
|         cnn_model = ( |  | ||||||
|             # TODO Make concatenate support lists |             # TODO Make concatenate support lists | ||||||
|             concatenate_lists(trained_vectors, static_vectors) |             vectors = concatenate_lists(trained_vectors, static_vectors) | ||||||
|  |             vectors_width = width*2 | ||||||
|  |         else: | ||||||
|  |             vectors = trained_vectors | ||||||
|  |             vectors_width = width | ||||||
|  |             static_vectors = None | ||||||
|  |         cnn_model = ( | ||||||
|  |             vectors | ||||||
|             >> with_flatten( |             >> with_flatten( | ||||||
|                 LN(Maxout(width, width*2)) |                 LN(Maxout(width, vectors_width)) | ||||||
|                 >> Residual( |                 >> Residual( | ||||||
|                     (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3))) |                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) | ||||||
|                 ) ** 2, pad=2 |                 ) ** 2, pad=2 | ||||||
|             ) |             ) | ||||||
|             >> flatten_add_lengths |             >> flatten_add_lengths | ||||||
|  | @ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | ||||||
|             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) |             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) | ||||||
|             >> logistic |             >> logistic | ||||||
|         ) |         ) | ||||||
| 
 |     model.nO = nr_class | ||||||
|     model.lsuv = False |     model.lsuv = False | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -3,14 +3,15 @@ | ||||||
| # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | ||||||
| 
 | 
 | ||||||
| __title__ = 'spacy-nightly' | __title__ = 'spacy-nightly' | ||||||
| __version__ = '2.0.0a13' | __version__ = '2.0.0a16' | ||||||
| __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | ||||||
| __uri__ = 'https://spacy.io' | __uri__ = 'https://spacy.io' | ||||||
| __author__ = 'Explosion AI' | __author__ = 'Explosion AI' | ||||||
| __email__ = 'contact@explosion.ai' | __email__ = 'contact@explosion.ai' | ||||||
| __license__ = 'MIT' | __license__ = 'MIT' | ||||||
|  | __release__ = True | ||||||
| 
 | 
 | ||||||
| __docs_models__ = 'https://spacy.io/docs/usage/models' | __docs_models__ = 'https://alpha.spacy.io/usage/models' | ||||||
| __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | ||||||
| __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | ||||||
| __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' | __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| # Reserve 64 values for flag features | # Reserve 64 values for flag features | ||||||
| cpdef enum attr_id_t: | cdef enum attr_id_t: | ||||||
|     NULL_ATTR |     NULL_ATTR | ||||||
|     IS_ALPHA |     IS_ALPHA | ||||||
|     IS_ASCII |     IS_ASCII | ||||||
|  |  | ||||||
|  | @ -94,6 +94,7 @@ IDS = { | ||||||
| 
 | 
 | ||||||
| # ATTR IDs, in order of the symbol | # ATTR IDs, in order of the symbol | ||||||
| NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | ||||||
|  | locals().update(IDS) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||||
|  |  | ||||||
|  | @ -4,5 +4,6 @@ from .link import link | ||||||
| from .package import package | from .package import package | ||||||
| from .profile import profile | from .profile import profile | ||||||
| from .train import train | from .train import train | ||||||
|  | from .evaluate import evaluate | ||||||
| from .convert import convert | from .convert import convert | ||||||
| from .model import model | from .model import model | ||||||
|  |  | ||||||
|  | @ -14,7 +14,7 @@ from ..util import prints | ||||||
| CONVERTERS = { | CONVERTERS = { | ||||||
|     '.conllu': conllu2json, |     '.conllu': conllu2json, | ||||||
|     '.conll': conllu2json, |     '.conll': conllu2json, | ||||||
|     '.iob': iob2json |     '.iob': iob2json, | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,5 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  | from cytoolz import partition_all, concat | ||||||
| 
 | 
 | ||||||
| from ...compat import json_dumps, path2str | from ...compat import json_dumps, path2str | ||||||
| from ...util import prints | from ...util import prints | ||||||
|  | @ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|     """ |     """ | ||||||
|     Convert IOB files into JSON format for use with train cli. |     Convert IOB files into JSON format for use with train cli. | ||||||
|     """ |     """ | ||||||
|     # TODO: This isn't complete yet -- need to map from IOB to |  | ||||||
|     # BILUO |  | ||||||
|     with input_path.open('r', encoding='utf8') as file_: |     with input_path.open('r', encoding='utf8') as file_: | ||||||
|         docs = read_iob(file_) |         sentences = read_iob(file_) | ||||||
| 
 |     docs = merge_sentences(sentences, n_sents) | ||||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") |     output_filename = input_path.parts[-1].replace(".iob", ".json") | ||||||
|     output_file = output_path / output_filename |     output_file = output_path / output_filename | ||||||
|     with output_file.open('w', encoding='utf-8') as f: |     with output_file.open('w', encoding='utf-8') as f: | ||||||
|  | @ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | ||||||
|            title="Generated output file %s" % path2str(output_file)) |            title="Generated output file %s" % path2str(output_file)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def read_iob(file_): | def read_iob(raw_sents): | ||||||
|     sentences = [] |     sentences = [] | ||||||
|     for line in file_: |     for line in raw_sents: | ||||||
|         if not line.strip(): |         if not line.strip(): | ||||||
|             continue |             continue | ||||||
|         tokens = [t.split('|') for t in line.split()] |         tokens = [t.split('|') for t in line.split()] | ||||||
|  | @ -43,3 +42,15 @@ def read_iob(file_): | ||||||
|     paragraphs = [{'sentences': [sent]} for sent in sentences] |     paragraphs = [{'sentences': [sent]} for sent in sentences] | ||||||
|     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] |     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] | ||||||
|     return docs |     return docs | ||||||
|  | 
 | ||||||
|  | def merge_sentences(docs, n_sents): | ||||||
|  |     counter = 0 | ||||||
|  |     merged = [] | ||||||
|  |     for group in partition_all(n_sents, docs): | ||||||
|  |         group = list(group) | ||||||
|  |         first = group.pop(0) | ||||||
|  |         to_extend = first['paragraphs'][0]['sentences'] | ||||||
|  |         for sent in group: | ||||||
|  |             to_extend.extend(sent['paragraphs'][0]['sentences']) | ||||||
|  |         merged.append(first) | ||||||
|  |     return merged | ||||||
|  |  | ||||||
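For reference, a hypothetical input line in the format read_iob expects: one sentence per line, tokens separated by whitespace, and each token's fields joined with '|':

    I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O Berlin|NNP|B-GPE .|.|O

A sketch of the matching CLI call, assuming the input file uses the .iob extension so the converter table in spacy/cli/convert.py dispatches to iob2json:

    python -m spacy convert /path/to/data.iob /output/dir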
spacy/cli/evaluate.py (new file, 119 lines)
|  | @ -0,0 +1,119 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals, division, print_function | ||||||
|  | 
 | ||||||
|  | import plac | ||||||
|  | import json | ||||||
|  | from collections import defaultdict | ||||||
|  | import cytoolz | ||||||
|  | from pathlib import Path | ||||||
|  | import dill | ||||||
|  | import tqdm | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
|  | from thinc.neural.optimizers import linear_decay | ||||||
|  | from timeit import default_timer as timer | ||||||
|  | import random | ||||||
|  | import numpy.random | ||||||
|  | 
 | ||||||
|  | from ..tokens.doc import Doc | ||||||
|  | from ..scorer import Scorer | ||||||
|  | from ..gold import GoldParse, merge_sents | ||||||
|  | from ..gold import GoldCorpus, minibatch | ||||||
|  | from ..util import prints | ||||||
|  | from .. import util | ||||||
|  | from .. import about | ||||||
|  | from .. import displacy | ||||||
|  | from ..compat import json_dumps | ||||||
|  | 
 | ||||||
|  | random.seed(0) | ||||||
|  | numpy.random.seed(0) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @plac.annotations( | ||||||
|  |     model=("Model name or path", "positional", None, str), | ||||||
|  |     data_path=("Location of JSON-formatted evaluation data", "positional", None, str), | ||||||
|  |     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||||
|  |     gpu_id=("Use GPU", "option", "g", int), | ||||||
|  |     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), | ||||||
|  |     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int) | ||||||
|  | ) | ||||||
|  | def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, | ||||||
|  |              displacy_path=None, displacy_limit=25): | ||||||
|  |     """ | ||||||
|  |     Evaluate a model. To render a sample of parses in an HTML file, set an output | ||||||
|  |     directory as the displacy_path argument. | ||||||
|  |     """ | ||||||
|  |     util.use_gpu(gpu_id) | ||||||
|  |     util.set_env_log(False) | ||||||
|  |     data_path = util.ensure_path(data_path) | ||||||
|  |     displacy_path = util.ensure_path(displacy_path) | ||||||
|  |     if not data_path.exists(): | ||||||
|  |         prints(data_path, title="Evaluation data not found", exits=1) | ||||||
|  |     if displacy_path and not displacy_path.exists(): | ||||||
|  |         prints(displacy_path, title="Visualization output directory not found", exits=1) | ||||||
|  |     corpus = GoldCorpus(data_path, data_path) | ||||||
|  |     nlp = util.load_model(model) | ||||||
|  |     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) | ||||||
|  |     begin = timer() | ||||||
|  |     scorer = nlp.evaluate(dev_docs, verbose=False) | ||||||
|  |     end = timer() | ||||||
|  |     nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) | ||||||
|  |     print_results(scorer, time=end - begin, words=nwords, | ||||||
|  |                   wps=nwords / (end - begin)) | ||||||
|  |     if displacy_path: | ||||||
|  |         docs, golds = zip(*dev_docs) | ||||||
|  |         render_deps = 'parser' in nlp.meta.get('pipeline', []) | ||||||
|  |         render_ents = 'ner' in nlp.meta.get('pipeline', []) | ||||||
|  |         render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, | ||||||
|  |                       deps=render_deps, ents=render_ents) | ||||||
|  |         prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): | ||||||
|  |     docs[0].user_data['title'] = model_name | ||||||
|  |     if ents: | ||||||
|  |         with (output_path / 'entities.html').open('w') as file_: | ||||||
|  |             html = displacy.render(docs[:limit], style='ent', page=True) | ||||||
|  |             file_.write(html) | ||||||
|  |     if deps: | ||||||
|  |         with (output_path / 'parses.html').open('w') as file_: | ||||||
|  |             html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) | ||||||
|  |             file_.write(html) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_progress(itn, losses, dev_scores, wps=0.0): | ||||||
|  |     scores = {} | ||||||
|  |     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', | ||||||
|  |                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||||
|  |         scores[col] = 0.0 | ||||||
|  |     scores['dep_loss'] = losses.get('parser', 0.0) | ||||||
|  |     scores['ner_loss'] = losses.get('ner', 0.0) | ||||||
|  |     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||||
|  |     scores.update(dev_scores) | ||||||
|  |     scores['wps'] = wps | ||||||
|  |     tpl = '\t'.join(( | ||||||
|  |         '{:d}', | ||||||
|  |         '{dep_loss:.3f}', | ||||||
|  |         '{ner_loss:.3f}', | ||||||
|  |         '{uas:.3f}', | ||||||
|  |         '{ents_p:.3f}', | ||||||
|  |         '{ents_r:.3f}', | ||||||
|  |         '{ents_f:.3f}', | ||||||
|  |         '{tags_acc:.3f}', | ||||||
|  |         '{token_acc:.3f}', | ||||||
|  |         '{wps:.1f}')) | ||||||
|  |     print(tpl.format(itn, **scores)) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def print_results(scorer, time, words, wps): | ||||||
|  |     results = { | ||||||
|  |         'Time': '%.2f s' % time, | ||||||
|  |         'Words': words, | ||||||
|  |         'Words/s': '%.0f' % wps, | ||||||
|  |         'TOK': '%.2f' % scorer.token_acc, | ||||||
|  |         'POS': '%.2f' % scorer.tags_acc, | ||||||
|  |         'UAS': '%.2f' % scorer.uas, | ||||||
|  |         'LAS': '%.2f' % scorer.las, | ||||||
|  |         'NER P': '%.2f' % scorer.ents_p, | ||||||
|  |         'NER R': '%.2f' % scorer.ents_r, | ||||||
|  |         'NER F': '%.2f' % scorer.ents_f} | ||||||
|  |     util.print_table(results, title="Results") | ||||||
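A hypothetical invocation of the new command, assuming a trained model (package name or directory) and a JSON-formatted development set:

    python -m spacy evaluate /path/to/model /path/to/dev.json

Passing a directory through the displacy_path option additionally writes entities.html and/or parses.html there, depending on which components the model's pipeline metadata declares.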
|  | @ -105,8 +105,11 @@ def generate_pipeline(): | ||||||
|            "parser, ner. For more information, see the docs on processing pipelines.", |            "parser, ner. For more information, see the docs on processing pipelines.", | ||||||
|            title="Enter your model's pipeline components") |            title="Enter your model's pipeline components") | ||||||
|     pipeline = util.get_raw_input("Pipeline components", True) |     pipeline = util.get_raw_input("Pipeline components", True) | ||||||
|     replace = {'True': True, 'False': False} |     subs = {'True': True, 'False': False} | ||||||
|     return replace[pipeline] if pipeline in replace else pipeline.split(', ') |     if pipeline in subs: | ||||||
|  |         return subs[pipeline] | ||||||
|  |     else: | ||||||
|  |         return [p.strip() for p in pipeline.split(',')] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def validate_meta(meta, keys): | def validate_meta(meta, keys): | ||||||
|  |  | ||||||
|  | @ -8,8 +8,11 @@ import cytoolz | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import dill | import dill | ||||||
| import tqdm | import tqdm | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| from thinc.neural.optimizers import linear_decay | from thinc.neural.optimizers import linear_decay | ||||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||||
|  | import random | ||||||
|  | import numpy.random | ||||||
| 
 | 
 | ||||||
| from ..tokens.doc import Doc | from ..tokens.doc import Doc | ||||||
| from ..scorer import Scorer | from ..scorer import Scorer | ||||||
|  | @ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents | ||||||
| from ..gold import GoldCorpus, minibatch | from ..gold import GoldCorpus, minibatch | ||||||
| from ..util import prints | from ..util import prints | ||||||
| from .. import util | from .. import util | ||||||
|  | from .. import about | ||||||
| from .. import displacy | from .. import displacy | ||||||
| from ..compat import json_dumps | from ..compat import json_dumps | ||||||
| 
 | 
 | ||||||
|  | random.seed(0) | ||||||
|  | numpy.random.seed(0) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @plac.annotations( | @plac.annotations( | ||||||
|     lang=("model language", "positional", None, str), |     lang=("model language", "positional", None, str), | ||||||
|  | @ -29,15 +36,17 @@ from ..compat import json_dumps | ||||||
|     n_iter=("number of iterations", "option", "n", int), |     n_iter=("number of iterations", "option", "n", int), | ||||||
|     n_sents=("number of sentences", "option", "ns", int), |     n_sents=("number of sentences", "option", "ns", int), | ||||||
|     use_gpu=("Use GPU", "option", "g", int), |     use_gpu=("Use GPU", "option", "g", int), | ||||||
|     resume=("Whether to resume training", "flag", "R", bool), |     vectors=("Model to load vectors from", "option", "v"), | ||||||
|     no_tagger=("Don't train tagger", "flag", "T", bool), |     no_tagger=("Don't train tagger", "flag", "T", bool), | ||||||
|     no_parser=("Don't train parser", "flag", "P", bool), |     no_parser=("Don't train parser", "flag", "P", bool), | ||||||
|     no_entities=("Don't train NER", "flag", "N", bool), |     no_entities=("Don't train NER", "flag", "N", bool), | ||||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), |     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||||
|  |     version=("Model version", "option", "V", str), | ||||||
|  |     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) | ||||||
| ) | ) | ||||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||||
|           use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False, |           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, | ||||||
|           gold_preproc=False): |           gold_preproc=False, version="0.0.0", meta_path=None): | ||||||
|     """ |     """ | ||||||
|     Train a model. Expects data in spaCy's JSON format. |     Train a model. Expects data in spaCy's JSON format. | ||||||
|     """ |     """ | ||||||
|  | @ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|     output_path = util.ensure_path(output_dir) |     output_path = util.ensure_path(output_dir) | ||||||
|     train_path = util.ensure_path(train_data) |     train_path = util.ensure_path(train_data) | ||||||
|     dev_path = util.ensure_path(dev_data) |     dev_path = util.ensure_path(dev_data) | ||||||
|  |     meta_path = util.ensure_path(meta_path) | ||||||
|     if not output_path.exists(): |     if not output_path.exists(): | ||||||
|         output_path.mkdir() |         output_path.mkdir() | ||||||
|     if not train_path.exists(): |     if not train_path.exists(): | ||||||
|         prints(train_path, title="Training data not found", exits=1) |         prints(train_path, title="Training data not found", exits=1) | ||||||
|     if dev_path and not dev_path.exists(): |     if dev_path and not dev_path.exists(): | ||||||
|         prints(dev_path, title="Development data not found", exits=1) |         prints(dev_path, title="Development data not found", exits=1) | ||||||
|  |     if meta_path is not None and not meta_path.exists(): | ||||||
|  |         prints(meta_path, title="meta.json not found", exits=1) | ||||||
|  |     meta = util.read_json(meta_path) if meta_path else {} | ||||||
|  |     if not isinstance(meta, dict): | ||||||
|  |         prints("Expected dict but got: {}".format(type(meta)), | ||||||
|  |                title="Not a valid meta.json format", exits=1) | ||||||
| 
 | 
 | ||||||
|     lang_class = util.get_lang_class(lang) |     pipeline = ['tagger', 'parser', 'ner'] | ||||||
| 
 |     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') | ||||||
|     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] |     if no_parser and 'parser' in pipeline: pipeline.remove('parser') | ||||||
|     if no_tagger and 'tags' in pipeline: pipeline.remove('tags') |     if no_entities and 'ner' in pipeline: pipeline.remove('ner') | ||||||
|     if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') |  | ||||||
|     if no_entities and 'entities' in pipeline: pipeline.remove('entities') |  | ||||||
| 
 | 
 | ||||||
|     # Take dropout and batch size as generators of values -- dropout |     # Take dropout and batch size as generators of values -- dropout | ||||||
|     # starts high and decays sharply, to force the optimizer to explore. |     # starts high and decays sharply, to force the optimizer to explore. | ||||||
|  | @ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|                                   util.env_opt('dropout_to', 0.2), |                                   util.env_opt('dropout_to', 0.2), | ||||||
|                                   util.env_opt('dropout_decay', 0.0)) |                                   util.env_opt('dropout_decay', 0.0)) | ||||||
|     batch_sizes = util.compounding(util.env_opt('batch_from', 1), |     batch_sizes = util.compounding(util.env_opt('batch_from', 1), | ||||||
|                                    util.env_opt('batch_to', 64), |                                    util.env_opt('batch_to', 16), | ||||||
|                                    util.env_opt('batch_compound', 1.001)) |                                    util.env_opt('batch_compound', 1.001)) | ||||||
| 
 |  | ||||||
|     if resume: |  | ||||||
|         prints(output_path / 'model9.pickle', title="Resuming training") |  | ||||||
|         nlp = dill.load((output_path / 'model9.pickle').open('rb')) |  | ||||||
|     else: |  | ||||||
|         nlp = lang_class(pipeline=pipeline) |  | ||||||
|     corpus = GoldCorpus(train_path, dev_path, limit=n_sents) |     corpus = GoldCorpus(train_path, dev_path, limit=n_sents) | ||||||
|     n_train_words = corpus.count_train() |     n_train_words = corpus.count_train() | ||||||
| 
 | 
 | ||||||
|  |     lang_class = util.get_lang_class(lang) | ||||||
|  |     nlp = lang_class(pipeline=pipeline) | ||||||
|  |     if vectors: | ||||||
|  |         util.load_model(vectors, vocab=nlp.vocab) | ||||||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) |     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) | ||||||
|  |     nlp._optimizer = None | ||||||
| 
 | 
 | ||||||
|     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") |     print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") | ||||||
|     try: |     try: | ||||||
|  |         train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, | ||||||
|  |                                        gold_preproc=gold_preproc, max_length=0) | ||||||
|  |         train_docs = list(train_docs) | ||||||
|         for i in range(n_iter): |         for i in range(n_iter): | ||||||
|             if resume: |  | ||||||
|                 i += 20 |  | ||||||
|             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: |             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: | ||||||
|                 train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, |  | ||||||
|                                                gold_preproc=gold_preproc, max_length=0) |  | ||||||
|                 losses = {} |                 losses = {} | ||||||
|                 for batch in minibatch(train_docs, size=batch_sizes): |                 for batch in minibatch(train_docs, size=batch_sizes): | ||||||
|                     docs, golds = zip(*batch) |                     docs, golds = zip(*batch) | ||||||
|                     nlp.update(docs, golds, sgd=optimizer, |                     nlp.update(docs, golds, sgd=optimizer, | ||||||
|                                drop=next(dropout_rates), losses=losses, |                                drop=next(dropout_rates), losses=losses) | ||||||
|                                update_shared=True) |  | ||||||
|                     pbar.update(sum(len(doc) for doc in docs)) |                     pbar.update(sum(len(doc) for doc in docs)) | ||||||
| 
 | 
 | ||||||
|             with nlp.use_params(optimizer.averages): |             with nlp.use_params(optimizer.averages): | ||||||
|  | @ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||||
|                 nlp_loaded = lang_class(pipeline=pipeline) |                 nlp_loaded = lang_class(pipeline=pipeline) | ||||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) |                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) | ||||||
|                 scorer = nlp_loaded.evaluate( |                 scorer = nlp_loaded.evaluate( | ||||||
|                             corpus.dev_docs( |                             list(corpus.dev_docs( | ||||||
|                                 nlp_loaded, |                                 nlp_loaded, | ||||||
|                                 gold_preproc=gold_preproc)) |                                 gold_preproc=gold_preproc))) | ||||||
|                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') |                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') | ||||||
|                 with acc_loc.open('w') as file_: |                 with acc_loc.open('w') as file_: | ||||||
|                     file_.write(json_dumps(scorer.scores)) |                     file_.write(json_dumps(scorer.scores)) | ||||||
|  |                 meta_loc = output_path / ('model%d' % i) / 'meta.json' | ||||||
|  |                 meta['accuracy'] = scorer.scores | ||||||
|  |                 meta['lang'] = nlp.lang | ||||||
|  |                 meta['pipeline'] = pipeline | ||||||
|  |                 meta['spacy_version'] = '>=%s' % about.__version__ | ||||||
|  |                 meta.setdefault('name', 'model%d' % i) | ||||||
|  |                 meta.setdefault('version', version) | ||||||
|  | 
 | ||||||
|  |                 with meta_loc.open('w') as file_: | ||||||
|  |                     file_.write(json_dumps(meta)) | ||||||
|                 util.set_env_log(True) |                 util.set_env_log(True) | ||||||
|             print_progress(i, losses, scorer.scores) |             print_progress(i, losses, scorer.scores) | ||||||
|     finally: |     finally: | ||||||
|  | @ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0): | ||||||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: |                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||||
|         scores[col] = 0.0 |         scores[col] = 0.0 | ||||||
|     scores['dep_loss'] = losses.get('parser', 0.0) |     scores['dep_loss'] = losses.get('parser', 0.0) | ||||||
|  |     scores['ner_loss'] = losses.get('ner', 0.0) | ||||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) |     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||||
|     scores.update(dev_scores) |     scores.update(dev_scores) | ||||||
|     scores['wps'] = wps |     scores['wps'] = wps | ||||||
|     tpl = '\t'.join(( |     tpl = '\t'.join(( | ||||||
|         '{:d}', |         '{:d}', | ||||||
|         '{dep_loss:.3f}', |         '{dep_loss:.3f}', | ||||||
|  |         '{ner_loss:.3f}', | ||||||
|         '{uas:.3f}', |         '{uas:.3f}', | ||||||
|         '{ents_p:.3f}', |         '{ents_p:.3f}', | ||||||
|         '{ents_r:.3f}', |         '{ents_r:.3f}', | ||||||
|  |  | ||||||
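A sketch of how the reworked training options might be combined; all names and paths below are placeholders. Here -n sets the iteration count, -v loads vectors from another model, -m merges an existing meta.json into the per-epoch output, and -V stamps the model version:

    python -m spacy train en /output /data/train.json /data/dev.json -n 10 -v /path/to/vectors_model -m meta.json -V 0.1.0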
|  | @ -7,6 +7,7 @@ import re | ||||||
| import ujson | import ujson | ||||||
| import random | import random | ||||||
| import cytoolz | import cytoolz | ||||||
|  | import itertools | ||||||
| 
 | 
 | ||||||
| from .syntax import nonproj | from .syntax import nonproj | ||||||
| from .util import ensure_path | from .util import ensure_path | ||||||
|  | @ -146,9 +147,13 @@ def minibatch(items, size=8): | ||||||
|     '''Iterate over batches of items. `size` may be an iterator, |     '''Iterate over batches of items. `size` may be an iterator, | ||||||
|     so that batch-size can vary on each step. |     so that batch-size can vary on each step. | ||||||
|     ''' |     ''' | ||||||
|  |     if isinstance(size, int): | ||||||
|  |         size_ = itertools.repeat(size) | ||||||
|  |     else: | ||||||
|  |         size_ = size | ||||||
|     items = iter(items) |     items = iter(items) | ||||||
|     while True: |     while True: | ||||||
|         batch_size = next(size) #if hasattr(size, '__next__') else size |         batch_size = next(size_) | ||||||
|         batch = list(cytoolz.take(int(batch_size), items)) |         batch = list(cytoolz.take(int(batch_size), items)) | ||||||
|         if len(batch) == 0: |         if len(batch) == 0: | ||||||
|             break |             break | ||||||
|  |  | ||||||
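A small sketch of the two call styles the revised minibatch helper accepts, using toy data; the compounding schedule mirrors the batch-size generator set up in spacy/cli/train.py:

    from spacy.gold import minibatch
    from spacy import util

    items = list(range(10))
    # An int batch size is wrapped in an infinite iterator internally.
    for batch in minibatch(items, size=4):
        print(len(batch))
    # Any iterator of sizes also works, e.g. a compounding schedule.
    for batch in minibatch(items, size=util.compounding(1., 4., 1.5)):
        print(len(batch))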
|  | @ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm | ||||||
|           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' |           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' | ||||||
|           'TB T G M K %') |           'TB T G M K %') | ||||||
| _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' | _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' | ||||||
| _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' | _punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·' | ||||||
| _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' | _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' | ||||||
| _hyphens = '- – — -- ---' | _hyphens = '- – — -- --- —— ~' | ||||||
| _other_symbols = r'[\p{So}]' | _other_symbols = r'[\p{So}]' | ||||||
| 
 | 
 | ||||||
| UNITS = merge_chars(_units) | UNITS = merge_chars(_units) | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
| from .norm_exceptions import NORM_EXCEPTIONS | from .norm_exceptions import NORM_EXCEPTIONS | ||||||
|  | from .punctuation import TOKENIZER_INFIXES | ||||||
| from .tag_map import TAG_MAP | from .tag_map import TAG_MAP | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
|  | @ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults): | ||||||
|                                          NORM_EXCEPTIONS, BASE_NORMS) |                                          NORM_EXCEPTIONS, BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||||
|  |     infixes = tuple(TOKENIZER_INFIXES) | ||||||
|     tag_map = dict(TAG_MAP) |     tag_map = dict(TAG_MAP) | ||||||
|     stop_words = set(STOP_WORDS) |     stop_words = set(STOP_WORDS) | ||||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) |     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||||
|  |  | ||||||
spacy/lang/de/punctuation.py (new file, 20 lines)
|  | @ -0,0 +1,20 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ..char_classes import LIST_ELLIPSES, LIST_ICONS | ||||||
|  | from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _quotes = QUOTES.replace("'", '') | ||||||
|  | 
 | ||||||
|  | _infixes = (LIST_ELLIPSES + LIST_ICONS + | ||||||
|  |             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), | ||||||
|  |              r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), | ||||||
|  |              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), | ||||||
|  |              r'(?<=[0-9])-(?=[0-9])']) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TOKENIZER_INFIXES = _infixes | ||||||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | ||||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | ||||||
| from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
|  | from .lex_attrs import LEX_ATTRS | ||||||
| from .lemmatizer import LOOKUP | from .lemmatizer import LOOKUP | ||||||
| from .syntax_iterators import SYNTAX_ITERATORS | from .syntax_iterators import SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
|  | @ -17,6 +18,7 @@ from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
| class FrenchDefaults(Language.Defaults): | class FrenchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'fr' |     lex_attr_getters[LANG] = lambda text: 'fr' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
spacy/lang/fr/lex_attrs.py (new file, 41 lines)
|  | @ -0,0 +1,41 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...attrs import LIKE_NUM | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _num_words = set(""" | ||||||
|  | zero un deux trois quatre cinq six sept huit neuf dix | ||||||
|  | onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf | ||||||
|  | vingt trente quarante cinquante soixante septante quatre-vingt huitante nonante | ||||||
|  | cent mille mil million milliard billion quadrillion quintillion | ||||||
|  | sextillion septillion octillion nonillion decillion | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | _ordinal_words = set(""" | ||||||
|  | premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième | ||||||
|  | onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neuvième | ||||||
|  | vingtième trentième quarantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième | ||||||
|  | centième millième millionnième milliardième billionnième quadrillionnième quintillionnième | ||||||
|  | sextillionnième septillionnième octillionnième nonillionnième decillionnième | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def like_num(text): | ||||||
|  |     # Might require more work? | ||||||
|  |     # See this discussion: https://github.com/explosion/spaCy/pull/1161 | ||||||
|  |     text = text.replace(',', '').replace('.', '') | ||||||
|  |     if text.isdigit(): | ||||||
|  |         return True | ||||||
|  |     if text.count('/') == 1: | ||||||
|  |         num, denom = text.split('/') | ||||||
|  |         if num.isdigit() and denom.isdigit(): | ||||||
|  |             return True | ||||||
|  |     if text in _num_words: | ||||||
|  |         return True | ||||||
|  |     return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | LEX_ATTRS = { | ||||||
|  |     LIKE_NUM: like_num | ||||||
|  | } | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from .stop_words import STOP_WORDS | from .stop_words import STOP_WORDS | ||||||
|  | from .lex_attrs import LEX_ATTRS | ||||||
| 
 | 
 | ||||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
| from ..norm_exceptions import BASE_NORMS | from ..norm_exceptions import BASE_NORMS | ||||||
|  | @ -12,6 +13,7 @@ from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
| class DutchDefaults(Language.Defaults): | class DutchDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters.update(LEX_ATTRS) | ||||||
|     lex_attr_getters[LANG] = lambda text: 'nl' |     lex_attr_getters[LANG] = lambda text: 'nl' | ||||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
spacy/lang/nl/lex_attrs.py (new file, 40 lines)
|  | @ -0,0 +1,40 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...attrs import LIKE_NUM | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _num_words = set(""" | ||||||
|  | nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien | ||||||
|  | veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd | ||||||
|  | duizend miljoen miljard biljoen biljard triljoen triljard | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | _ordinal_words = set(""" | ||||||
|  | eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde | ||||||
|  | twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste | ||||||
|  | zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste | ||||||
|  | miljardste biljoenste biljardste triljoenste triljardste | ||||||
|  | """.split()) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def like_num(text): | ||||||
|  |     # This only does the most basic check for whether a token is a digit | ||||||
|  |     # or matches one of the number words. In order to handle numbers like | ||||||
|  |     # "drieëntwintig", more work is required. | ||||||
|  |     # See this discussion: https://github.com/explosion/spaCy/pull/1177 | ||||||
|  |     text = text.replace(',', '').replace('.', '') | ||||||
|  |     if text.isdigit(): | ||||||
|  |         return True | ||||||
|  |     if text.count('/') == 1: | ||||||
|  |         num, denom = text.split('/') | ||||||
|  |         if num.isdigit() and denom.isdigit(): | ||||||
|  |             return True | ||||||
|  |     if text in _num_words: | ||||||
|  |         return True | ||||||
|  |     return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | LEX_ATTRS = { | ||||||
|  |     LIKE_NUM: like_num | ||||||
|  | } | ||||||
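
A quick sketch of how the new Dutch lex_attrs hook surfaces on tokens once it is registered on the language defaults (see the nl/__init__.py hunk above). It assumes a blank Dutch pipeline and an arbitrary example sentence:

    from spacy.lang.nl import Dutch

    nlp = Dutch()
    doc = nlp(u'Hij heeft drie appels gekocht in 1992')
    # like_num consults the LIKE_NUM getter above, so both the number word
    # 'drie' and the digit string '1992' should come back as number-like.
    print([(token.text, token.like_num) for token in doc])
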
							
								
								
									
										35  spacy/lang/th/__init__.py  Normal file
							|  | @ -0,0 +1,35 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
|  | from .tag_map import TAG_MAP | ||||||
|  | from .stop_words import STOP_WORDS | ||||||
|  | 
 | ||||||
|  | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | from ...tokens import Doc | ||||||
|  | from ..norm_exceptions import BASE_NORMS | ||||||
|  | from ...language import Language | ||||||
|  | from ...attrs import LANG, NORM | ||||||
|  | from ...util import update_exc, add_lookups | ||||||
|  | 
 | ||||||
|  | class ThaiDefaults(Language.Defaults): | ||||||
|  |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  |     lex_attr_getters[LANG] = lambda text: 'th' | ||||||
|  |     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||||
|  |     tag_map = dict(TAG_MAP) | ||||||
|  |     stop_words = set(STOP_WORDS) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class Thai(Language): | ||||||
|  |     lang = 'th' | ||||||
|  |     Defaults = ThaiDefaults | ||||||
|  |     def make_doc(self, text): | ||||||
|  |         try: | ||||||
|  |             from pythainlp.tokenize import word_tokenize | ||||||
|  |         except ImportError: | ||||||
|  |             raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " | ||||||
|  |                               "https://github.com/wannaphongcom/pythainlp/") | ||||||
|  |         words = list(word_tokenize(text, "newmm")) | ||||||
|  |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
|  | 
 | ||||||
|  | __all__ = ['Thai'] | ||||||
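
A minimal usage sketch for the new Thai entry point. It assumes PyThaiNLP is installed, and the exact segmentation depends on its "newmm" tokenizer:

    from spacy.lang.th import Thai

    nlp = Thai()
    # make_doc() defers to pythainlp's word_tokenize, so a missing library
    # raises ImportError with a pointer to PyThaiNLP.
    doc = nlp(u'นี่คือประโยคภาษาไทย')
    print([token.text for token in doc])
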
							
								
								
									
										62  spacy/lang/th/stop_words.py  Normal file
							|  | @ -0,0 +1,62 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | # data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt | ||||||
|  | # stop words as whitespace-separated list | ||||||
|  | STOP_WORDS = set(""" | ||||||
|  | นี้	นํา	นั้น	นัก	นอกจาก	ทุก	ที่สุด	ที่	ทําให้	ทํา	ทาง	ทั้งนี้	ดัง	ซึ่ง	ช่วง	จาก	จัด	จะ	คือ	ความ	ครั้ง	คง	ขึ้น	ของ | ||||||
|  | ขอ	รับ	ระหว่าง	รวม	ยัง	มี	มาก	มา	พร้อม	พบ	ผ่าน	ผล	บาง	น่า	เปิดเผย	เปิด	เนื่องจาก	เดียวกัน	เดียว	เช่น	เฉพาะ	เข้า	ถ้า | ||||||
|  | ถูก	ถึง	ต้อง	ต่างๆ	ต่าง	ต่อ	ตาม	ตั้งแต่	ตั้ง	ด้าน	ด้วย	อีก	อาจ	ออก	อย่าง	อะไร	อยู่	อยาก	หาก	หลาย	หลังจาก	แต่	เอง	เห็น | ||||||
|  | เลย	เริ่ม	เรา	เมื่อ	เพื่อ	เพราะ	เป็นการ	เป็น	หลัง	หรือ	หนึ่ง	ส่วน	ส่ง	สุด	สําหรับ	ว่า	ลง	ร่วม	ราย	ขณะ	ก่อน	ก็	การ	กับ	กัน | ||||||
|  | กว่า	กล่าว	จึง	ไว้	ไป	ได้	ให้	ใน	โดย	แห่ง	แล้ว	และ	แรก	แบบ	ๆ	ทั้ง	วัน	เขา	เคย	ไม่	อยาก	เกิน	เกินๆ	เกี่ยวกัน	เกี่ยวกับ | ||||||
|  | เกี่ยวข้อง	เกี่ยวเนื่อง	เกี่ยวๆ	เกือบ	เกือบจะ	เกือบๆ	แก	แก่	แก้ไข	ใกล้	ใกล้ๆ	ไกล	ไกลๆ	ขณะเดียวกัน	ขณะใด	ขณะใดๆ	ขณะที่	ขณะนั้น	ขณะนี้	ขณะหนึ่ง	ขวาง | ||||||
|  | ขวางๆ	ขั้น	ใคร	ใคร่	ใคร่จะ	ใครๆ	ง่าย	ง่ายๆ	ไง	จง	จด	จน	จนกระทั่ง	จนกว่า	จนขณะนี้	จนตลอด	จนถึง	จนทั่ว	จนบัดนี้	จนเมื่อ	จนแม้	จนแม้น | ||||||
|  | จรด	จรดกับ	จริง	จริงจัง	จริงๆ	จริงๆจังๆ	จวน	จวนจะ	จวนเจียน	จวบ	ซึ่งก็	ซึ่งก็คือ	ซึ่งกัน	ซึ่งกันและกัน	ซึ่งได้แก่	ซึ่งๆ	ณ	ด้วย	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยประการฉะนี้ | ||||||
|  | ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น	ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดั่ง	ดังกล่าว	ดังกับ	ดั่งกับ	ดังกับว่า	ดั่งกับว่า	ดังเก่า | ||||||
|  | ดั่งเก่า	ดังเคย	ใดๆ	ได้	ได้แก่	ได้แต่	ได้ที่	ได้มา	ได้รับ	ตน	ตนเอง	ตนฯ	ตรง	ตรงๆ	ตลอด	ตลอดกาล	ตลอดกาลนาน	ตลอดจน	ตลอดถึง	ตลอดทั้ง | ||||||
|  | ตลอดทั่ว	ตลอดทั่วถึง	ตลอดทั่วทั้ง	ตลอดปี	ตลอดไป	ตลอดมา	ตลอดระยะเวลา	ตลอดวัน	ตลอดเวลา	ตลอดศก	ต่อ	ต่อกัน	ถึงแก่	ถึงจะ	ถึงบัดนั้น	ถึงบัดนี้ | ||||||
|  | ถึงเมื่อ	ถึงเมื่อใด	ถึงเมื่อไร	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือ	ถือว่า	ถูกต้อง	ถูกๆ	เถอะ	เถิด	ทรง	ทว่า	ทั้งคน	ทั้งตัว	ทั้งที	ทั้งที่	ทั้งนั้น	ทั้งนั้นด้วย	ทั้งนั้นเพราะ | ||||||
|  | นอก	นอกจากที่	นอกจากนั้น	นอกจากนี้	นอกจากว่า	นอกนั้น	นอกเหนือ	นอกเหนือจาก	น้อย	น้อยกว่า	น้อยๆ	นะ	น่ะ	นักๆ	นั่น	นั่นไง	นั่นเป็น	นั่นแหละ | ||||||
|  | นั่นเอง	นั้นๆ	นับ	นับจากนั้น	นับจากนี้	นับตั้งแต่	นับแต่	นับแต่ที่	นับแต่นั้น	เป็นต้น	เป็นต้นไป	เป็นต้นมา	เป็นแต่	เป็นแต่เพียง	เป็นที	เป็นที่	เป็นที่สุด	เป็นเพราะ | ||||||
|  | เป็นเพราะว่า	เป็นเพียง	เป็นเพียงว่า	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอันว่า	เป็นอันๆ	เป็นอาทิ	เป็นๆ	เปลี่ยน	เปลี่ยนแปลง	เปิด	เปิดเผย	ไป่	ผ่าน	ผ่านๆ | ||||||
|  | ผิด	ผิดๆ	ผู้	เพียงเพื่อ	เพียงไร	เพียงไหน	เพื่อที่	เพื่อที่จะ	เพื่อว่า	เพื่อให้	ภาค	ภาคฯ	ภาย	ภายใต้	ภายนอก	ภายใน	ภายภาค	ภายภาคหน้า	ภายหน้า	ภายหลัง | ||||||
|  | มอง	มองว่า	มัก	มักจะ	มัน	มันๆ	มั้ย	มั้ยนะ	มั้ยนั่น	มั้ยเนี่ย	มั้ยล่ะ	ยืนนาน	ยืนยง	ยืนยัน	ยืนยาว	เยอะ	เยอะแยะ	เยอะๆ	แยะ	แยะๆ	รวด	รวดเร็ว	ร่วม	รวมกัน	ร่วมกัน | ||||||
|  | รวมด้วย	ร่วมด้วย	รวมถึง	รวมทั้ง	ร่วมมือ	รวมๆ	ระยะ	ระยะๆ	ระหว่าง	รับรอง	รึ	รึว่า	รือ	รือว่า	สิ้นกาลนาน	สืบเนื่อง	สุดๆ	สู่	สูง	สูงกว่า	สูงส่ง	สูงสุด	สูงๆ	เสมือนกับ | ||||||
|  | เสมือนว่า	เสร็จ	เสร็จกัน	เสร็จแล้ว	เสร็จสมบูรณ์	เสร็จสิ้น	เสีย	เสียก่อน	เสียจน	เสียจนกระทั่ง	เสียจนถึง	เสียด้วย	เสียนั่น	เสียนั่นเอง	เสียนี่	เสียนี่กระไร	เสียยิ่ง | ||||||
|  | เสียยิ่งนัก	เสียแล้ว	ใหญ่ๆ	ให้ดี	ให้แด่	ให้ไป	ใหม่	ให้มา	ใหม่ๆ	ไหน	ไหนๆ	อดีต	อนึ่ง	อย่าง	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น | ||||||
|  | อย่างนี้	อย่างโน้น	ก็คือ	ก็แค่	ก็จะ	ก็ดี	ก็ได้	ก็ต่อเมื่อ	ก็ตาม	ก็ตามแต่	ก็ตามที	ก็แล้วแต่	กระทั่ง	กระทำ	กระนั้น	กระผม	กลับ	กล่าวคือ	กลุ่ม	กลุ่มก้อน | ||||||
|  | กลุ่มๆ	กว้าง	กว้างขวาง	กว้างๆ	ก่อนหน้า	ก่อนหน้านี้	ก่อนๆ	กันดีกว่า	กันดีไหม	กันเถอะ	กันนะ	กันและกัน	กันไหม	กันเอง	กำลัง	กำลังจะ	กำหนด	กู	เก็บ | ||||||
|  | เกิด	เกี่ยวข้อง	แก่	แก้ไข	ใกล้	ใกล้ๆ	ข้า	ข้าง	ข้างเคียง	ข้างต้น	ข้างบน	ข้างล่าง	ข้างๆ	ขาด	ข้าพเจ้า	ข้าฯ	เข้าใจ	เขียน	คงจะ	คงอยู่	ครบ	ครบครัน	ครบถ้วน | ||||||
|  | ครั้งกระนั้น	ครั้งก่อน	ครั้งครา	ครั้งคราว	ครั้งใด	ครั้งที่	ครั้งนั้น	ครั้งนี้	ครั้งละ	ครั้งหนึ่ง	ครั้งหลัง	ครั้งหลังสุด	ครั้งไหน	ครั้งๆ	ครัน	ครับ	ครา	คราใด	คราที่	ครานั้น	ครานี้	คราหนึ่ง | ||||||
|  | คราไหน	คราว	คราวก่อน	คราวใด	คราวที่	คราวนั้น	คราวนี้	คราวโน้น	คราวละ	คราวหน้า	คราวหนึ่ง	คราวหลัง	คราวไหน	คราวๆ	คล้าย	คล้ายกัน	คล้ายกันกับ | ||||||
|  | คล้ายกับ	คล้ายกับว่า	คล้ายว่า	ควร	ค่อน	ค่อนข้าง	ค่อนข้างจะ	ค่อยไปทาง	ค่อนมาทาง	ค่อย	ค่อยๆ	คะ	ค่ะ	คำ	คิด	คิดว่า	คุณ	คุณๆ | ||||||
|  | เคยๆ	แค่	แค่จะ	แค่นั้น	แค่นี้	แค่เพียง	แค่ว่า	แค่ไหน	ใคร่	ใคร่จะ	ง่าย	ง่ายๆ	จนกว่า	จนแม้	จนแม้น	จังๆ	จวบกับ	จวบจน	จ้ะ	จ๊ะ	จะได้	จัง	จัดการ	จัดงาน	จัดแจง | ||||||
|  | จัดตั้ง	จัดทำ	จัดหา	จัดให้	จับ	จ้า	จ๋า	จากนั้น	จากนี้ 	จากนี้ไป	จำ	จำเป็น 	จำพวก	จึงจะ	จึงเป็น	จู่ๆ	ฉะนั้น	ฉะนี้	ฉัน	เฉกเช่น	เฉย	เฉยๆ	ไฉน	ช่วงก่อน | ||||||
|  | ช่วงต่อไป	ช่วงถัดไป	ช่วงท้าย	ช่วงที่	ช่วงนั้น	ช่วงนี้	ช่วงระหว่าง	ช่วงแรก	ช่วงหน้า	ช่วงหลัง	ช่วงๆ	ช่วย	ช้า	ช้านาน	ชาว	ช้าๆ	เช่นก่อน	เช่นกัน	เช่นเคย | ||||||
|  | เช่นดัง	เช่นดังก่อน	เช่นดังเก่า	เช่นดังที่	เช่นดังว่า	เช่นเดียวกัน	เช่นเดียวกับ	เช่นใด	เช่นที่	เช่นที่เคย	เช่นที่ว่า	เช่นนั้น	เช่นนั้นเอง	เช่นนี้	เช่นเมื่อ	เช่นไร	เชื่อ | ||||||
|  | เชื่อถือ	เชื่อมั่น	เชื่อว่า	ใช่	ใช่ไหม	ใช้	ซะ	ซะก่อน	ซะจน	ซะจนกระทั่ง	ซะจนถึง	ซึ่งได้แก่	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น | ||||||
|  | ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดังกล่าว	ดังกับว่า	ดั่งกับว่า	ดังเก่า	ดั่งเก่า	ดั่งเคย	ต่างก็	ต่างหาก	ตามด้วย	ตามแต่	ตามที่ | ||||||
|  | ตามๆ	เต็มไปด้วย	เต็มไปหมด	เต็มๆ	แต่ก็	แต่ก่อน	แต่จะ	แต่เดิม	แต่ต้อง	แต่ถ้า	แต่ทว่า	แต่ที่	แต่นั้น	แต่เพียง	แต่เมื่อ	แต่ไร	แต่ละ	แต่ว่า	แต่ไหน	แต่อย่างใด	โต | ||||||
|  | โตๆ	ใต้	ถ้าจะ	ถ้าหาก	ถึงแก่	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือว่า	ถูกต้อง	ทว่า	ทั้งนั้นด้วย	ทั้งปวง	ทั้งเป็น	ทั้งมวล	ทั้งสิ้น	ทั้งหมด	ทั้งหลาย	ทั้งๆ	ทัน | ||||||
|  | ทันใดนั้น	ทันที	ทันทีทันใด	ทั่ว	ทำไม	ทำไร	ทำให้	ทำๆ	ที	ที่จริง	ที่ซึ่ง	ทีเดียว	ทีใด	ที่ใด	ที่ได้	ทีเถอะ	ที่แท้	ที่แท้จริง	ที่นั้น	ที่นี้	ทีไร	ทีละ	ที่ละ | ||||||
|  | ที่แล้ว	ที่ว่า	ที่แห่งนั้น	ที่ไหน	ทีๆ	ที่ๆ	ทุกคน	ทุกครั้ง	ทุกครา	ทุกคราว	ทุกชิ้น	ทุกตัว	ทุกทาง	ทุกที	ทุกที่	ทุกเมื่อ	ทุกวัน	ทุกวันนี้	ทุกสิ่ง	ทุกหน	ทุกแห่ง	ทุกอย่าง | ||||||
|  | ทุกอัน	ทุกๆ	เท่า	เท่ากัน	เท่ากับ	เท่าใด	เท่าที่	เท่านั้น	เท่านี้	เท่าไร	เท่าไหร่	แท้	แท้จริง	เธอ	นอกจากว่า	น้อย	น้อยกว่า	น้อยๆ	น่ะ	นั้นไว	นับแต่นี้	นาง | ||||||
|  | นางสาว	น่าจะ	นาน	นานๆ	นาย	นำ	นำพา	นำมา	นิด	นิดหน่อย	นิดๆ	นี่	นี่ไง	นี่นา	นี่แน่ะ	นี่แหละ	นี้แหล่	นี่เอง	นี้เอง	นู่น	นู้น	เน้น	เนี่ย | ||||||
|  | เนี่ยเอง	ในช่วง	ในที่	ในเมื่อ	ในระหว่าง	บน	บอก	บอกแล้ว	บอกว่า	บ่อย	บ่อยกว่า	บ่อยครั้ง	บ่อยๆ	บัดดล	บัดเดี๋ยวนี้	บัดนั้น	บัดนี้	บ้าง	บางกว่า | ||||||
|  | บางขณะ	บางครั้ง	บางครา	บางคราว	บางที	บางที่	บางแห่ง	บางๆ	ปฏิบัติ	ประกอบ	ประการ	ประการฉะนี้	ประการใด	ประการหนึ่ง	ประมาณ	ประสบ	ปรับ | ||||||
|  | ปรากฏ	ปรากฏว่า	ปัจจุบัน	ปิด	เป็นด้วย	เป็นดัง	เป็นต้น	เป็นแต่	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอาทิ	ผ่านๆ	ผู้	ผู้ใด	เผื่อ	เผื่อจะ	เผื่อที่	เผื่อว่า	ฝ่าย | ||||||
|  | ฝ่ายใด	พบว่า	พยายาม	พร้อมกัน	พร้อมกับ	พร้อมด้วย	พร้อมทั้ง	พร้อมที่	พร้อมเพียง	พวก	พวกกัน	พวกกู	พวกแก	พวกเขา	พวกคุณ	พวกฉัน	พวกท่าน | ||||||
|  | พวกที่	พวกเธอ	พวกนั้น	พวกนี้	พวกนู้น	พวกโน้น	พวกมัน	พวกมึง	พอ	พอกัน	พอควร	พอจะ	พอดี	พอตัว	พอที	พอที่	พอเพียง	พอแล้ว	พอสม	พอสมควร | ||||||
|  | พอเหมาะ	พอๆ	พา	พึง	พึ่ง	พื้นๆ	พูด	เพราะฉะนั้น	เพราะว่า	เพิ่ง	เพิ่งจะ	เพิ่ม	เพิ่มเติม	เพียง	เพียงแค่	เพียงใด	เพียงแต่	เพียงพอ	เพียงเพราะ | ||||||
|  | เพื่อว่า	เพื่อให้	ภายใต้	มองว่า	มั๊ย	มากกว่า	มากมาย	มิ	มิฉะนั้น	มิใช่	มิได้	มีแต่	มึง	มุ่ง	มุ่งเน้น	มุ่งหมาย	เมื่อก่อน	เมื่อครั้ง	เมื่อครั้งก่อน | ||||||
|  | เมื่อคราวก่อน	เมื่อคราวที่	เมื่อคราว	เมื่อคืน	เมื่อเช้า	เมื่อใด	เมื่อนั้น	เมื่อนี้	เมื่อเย็น	เมื่อไร	เมื่อวันวาน	เมื่อวาน	เมื่อไหร่	แม้	แม้กระทั่ง	แม้แต่	แม้นว่า	แม้ว่า | ||||||
|  | ไม่ค่อย	ไม่ค่อยจะ	ไม่ค่อยเป็น	ไม่ใช่	ไม่เป็นไร	ไม่ว่า	ยก	ยกให้	ยอม	ยอมรับ	ย่อม	ย่อย	ยังคง	ยังงั้น	ยังงี้	ยังโง้น	ยังไง	ยังจะ	ยังแต่	ยาก | ||||||
|  | ยาว	ยาวนาน	ยิ่ง	ยิ่งกว่า	ยิ่งขึ้น	ยิ่งขึ้นไป	ยิ่งจน	ยิ่งจะ	ยิ่งนัก	ยิ่งเมื่อ	ยิ่งแล้ว	ยิ่งใหญ่	ร่วมกัน	รวมด้วย	ร่วมด้วย	รือว่า	เร็ว	เร็วๆ	เราๆ	เรียก	เรียบ	เรื่อย | ||||||
|  | เรื่อยๆ	ไร	ล้วน	ล้วนจน	ล้วนแต่	ละ	ล่าสุด	เล็ก	เล็กน้อย	เล็กๆ	เล่าว่า	แล้วกัน	แล้วแต่	แล้วเสร็จ	วันใด	วันนั้น	วันนี้	วันไหน	สบาย	สมัย	สมัยก่อน | ||||||
|  | สมัยนั้น	สมัยนี้	สมัยโน้น	ส่วนเกิน	ส่วนด้อย	ส่วนดี	ส่วนใด	ส่วนที่	ส่วนน้อย	ส่วนนั้น	ส่วนมาก	ส่วนใหญ่	สั้น	สั้นๆ	สามารถ	สำคัญ	สิ่ง | ||||||
|  | สิ่งใด	สิ่งนั้น	สิ่งนี้	สิ่งไหน	สิ้น	เสร็จแล้ว	เสียด้วย	เสียแล้ว	แสดง	แสดงว่า	หน	หนอ	หนอย	หน่อย	หมด	หมดกัน	หมดสิ้น	หรือไง	หรือเปล่า	หรือไม่	หรือยัง | ||||||
|  | หรือไร	หากแม้	หากแม้น	หากแม้นว่า	หากว่า	หาความ	หาใช่	หารือ	เหตุ	เหตุผล	เหตุนั้น	เหตุนี้	เหตุไร	เห็นแก่	เห็นควร	เห็นจะ	เห็นว่า	เหลือ	เหลือเกิน	เหล่า | ||||||
|  | เหล่านั้น	เหล่านี้	แห่งใด	แห่งนั้น	แห่งนี้	แห่งโน้น	แห่งไหน	แหละ	ให้แก่	ใหญ่	ใหญ่โต	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น	อย่างนี้ | ||||||
|  | อย่างโน้น	อย่างมาก	อย่างยิ่ง	อย่างไร	อย่างไรก็	อย่างไรก็ได้	อย่างไรเสีย	อย่างละ	อย่างหนึ่ง	อย่างไหน	อย่างๆ	อัน	อันจะ	อันใด	อันได้แก่	อันที่ | ||||||
|  | อันที่จริง	อันที่จะ	อันเนื่องมาจาก	อันละ	อันไหน	อันๆ	อาจจะ	อาจเป็น	อาจเป็นด้วย	อื่น	อื่นๆ	เอ็ง	เอา	ฯ	ฯล	ฯลฯ | ||||||
|  | """.split()) | ||||||
							
								
								
									
										81  spacy/lang/th/tag_map.py  Normal file
							|  | @ -0,0 +1,81 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import * | ||||||
|  | 
 | ||||||
|  | TAG_MAP = { | ||||||
|  |     #NOUN | ||||||
|  |     "NOUN":     {POS: NOUN}, | ||||||
|  |     "NCMN":     {POS: NOUN}, | ||||||
|  |     "NTTL":     {POS: NOUN}, | ||||||
|  |     "CNIT":     {POS: NOUN}, | ||||||
|  |     "CLTV":     {POS: NOUN}, | ||||||
|  |     "CMTR":     {POS: NOUN}, | ||||||
|  |     "CFQC":     {POS: NOUN}, | ||||||
|  |     "CVBL":     {POS: NOUN}, | ||||||
|  |     #PRON | ||||||
|  |     "PRON":     {POS: PRON}, | ||||||
|  |     "NPRP":     {POS: PRON}, | ||||||
|  |     # ADJ | ||||||
|  |     "ADJ":      {POS: ADJ}, | ||||||
|  |     "NONM":      {POS: ADJ}, | ||||||
|  |     "VATT":      {POS: ADJ}, | ||||||
|  |     "DONM":      {POS: ADJ}, | ||||||
|  |     # ADV | ||||||
|  |     "ADV":      {POS: ADV}, | ||||||
|  |     "ADVN":      {POS: ADV}, | ||||||
|  |     "ADVI":      {POS: ADV}, | ||||||
|  |     "ADVP":      {POS: ADV}, | ||||||
|  |     "ADVS":      {POS: ADV}, | ||||||
|  |     # INTJ | ||||||
|  |     "INT":      {POS: INTJ}, | ||||||
|  |     # PROPN | ||||||
|  |     "PROPN":    {POS: PROPN}, | ||||||
|  |     "PPRS":    {POS: PROPN}, | ||||||
|  |     "PDMN":    {POS: PROPN}, | ||||||
|  |     "PNTR":    {POS: PROPN}, | ||||||
|  |     # DET | ||||||
|  |     "DET":      {POS: DET}, | ||||||
|  |     "DDAN":      {POS: DET}, | ||||||
|  |     "DDAC":      {POS: DET}, | ||||||
|  |     "DDBQ":      {POS: DET}, | ||||||
|  |     "DDAQ":      {POS: DET}, | ||||||
|  |     "DIAC":      {POS: DET}, | ||||||
|  |     "DIBQ":      {POS: DET}, | ||||||
|  |     "DIAQ":      {POS: DET}, | ||||||
|  |     "DCNM":      {POS: DET},  # note: repeated under NUM below; the later entry wins | ||||||
|  |     # NUM | ||||||
|  |     "NUM":      {POS: NUM}, | ||||||
|  |     "NCNM":      {POS: NUM}, | ||||||
|  |     "NLBL":      {POS: NUM}, | ||||||
|  |     "DCNM":      {POS: NUM}, | ||||||
|  |     # AUX | ||||||
|  |     "AUX":      {POS: AUX}, | ||||||
|  |     "XVBM":      {POS: AUX}, | ||||||
|  |     "XVAM":      {POS: AUX}, | ||||||
|  |     "XVMM":      {POS: AUX}, | ||||||
|  |     "XVBB":      {POS: AUX}, | ||||||
|  |     "XVAE":      {POS: AUX}, | ||||||
|  |     # ADP | ||||||
|  |     "ADP":      {POS: ADP}, | ||||||
|  |     "RPRE":      {POS: ADP}, | ||||||
|  |     # CCONJ | ||||||
|  |     "CCONJ":    {POS: CCONJ}, | ||||||
|  |     "JCRG":    {POS: CCONJ}, | ||||||
|  |     # SCONJ | ||||||
|  |     "SCONJ":    {POS: SCONJ}, | ||||||
|  |     "PREL":    {POS: SCONJ}, | ||||||
|  |     "JSBR":    {POS: SCONJ}, | ||||||
|  |     "JCMP":    {POS: SCONJ}, | ||||||
|  |     # PART | ||||||
|  |     "PART":    {POS: PART}, | ||||||
|  |     "FIXN":    {POS: PART}, | ||||||
|  |     "FIXV":    {POS: PART}, | ||||||
|  |     "EAFF":    {POS: PART}, | ||||||
|  |     "AITT":    {POS: PART}, | ||||||
|  |     "NEG":    {POS: PART}, | ||||||
|  |     # PUNCT | ||||||
|  |     "PUNCT":    {POS: PUNCT}, | ||||||
|  |     "PUNC":    {POS: PUNCT} | ||||||
|  | } | ||||||
							
								
								
									
										43  spacy/lang/th/tokenizer_exceptions.py  Normal file
							|  | @ -0,0 +1,43 @@ | ||||||
|  | # encoding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import * | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = { | ||||||
|  |     "ม.ค.": [ | ||||||
|  |         {ORTH: "ม.ค.", LEMMA: "มกราคม"} | ||||||
|  |     ], | ||||||
|  |     "ก.พ.": [ | ||||||
|  |         {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} | ||||||
|  |     ], | ||||||
|  |     "มี.ค.": [ | ||||||
|  |         {ORTH: "มี.ค.", LEMMA: "มีนาคม"} | ||||||
|  |     ], | ||||||
|  |     "เม.ย.": [ | ||||||
|  |         {ORTH: "เม.ย.", LEMMA: "เมษายน"} | ||||||
|  |     ], | ||||||
|  |     "พ.ค.": [ | ||||||
|  |         {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} | ||||||
|  |     ], | ||||||
|  |     "มิ.ย.": [ | ||||||
|  |         {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} | ||||||
|  |     ], | ||||||
|  |     "ก.ค.": [ | ||||||
|  |         {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} | ||||||
|  |     ], | ||||||
|  |     "ส.ค.": [ | ||||||
|  |         {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} | ||||||
|  |     ], | ||||||
|  |     "ก.ย.": [ | ||||||
|  |         {ORTH: "ก.ย.", LEMMA: "กันยายน"} | ||||||
|  |     ], | ||||||
|  |     "ต.ค.": [ | ||||||
|  |         {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} | ||||||
|  |     ], | ||||||
|  |     "พ.ย.": [ | ||||||
|  |         {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} | ||||||
|  |     ], | ||||||
|  |     "ธ.ค.": [ | ||||||
|  |         {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} | ||||||
|  |     ] | ||||||
|  | } | ||||||
|  | @ -14,8 +14,8 @@ class Chinese(Language): | ||||||
|         except ImportError: |         except ImportError: | ||||||
|             raise ImportError("The Chinese tokenizer requires the Jieba library: " |             raise ImportError("The Chinese tokenizer requires the Jieba library: " | ||||||
|                               "https://github.com/fxsjy/jieba") |                               "https://github.com/fxsjy/jieba") | ||||||
|         words = list(jieba.cut(text, cut_all=True)) |         words = list(jieba.cut(text, cut_all=False)) | ||||||
|         words=[x for x in words if x] |         words = [x for x in words if x] | ||||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) |         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
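
The zh/__init__.py hunk above switches jieba from full mode (cut_all=True) to accurate mode (cut_all=False). A small standalone sketch of the difference, using jieba directly with an arbitrary example sentence:

    import jieba

    text = u'我来到北京清华大学'
    # Full mode emits every dictionary word it can find, so segments overlap.
    print(list(jieba.cut(text, cut_all=True)))
    # Accurate mode picks a single, non-overlapping segmentation, which is
    # what a Doc with one token per word actually needs.
    print(list(jieba.cut(text, cut_all=False)))
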
|  | @ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP | ||||||
| from .lang.lex_attrs import LEX_ATTRS | from .lang.lex_attrs import LEX_ATTRS | ||||||
| from . import util | from . import util | ||||||
| from .scorer import Scorer | from .scorer import Scorer | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BaseDefaults(object): | class BaseDefaults(object): | ||||||
|  | @ -278,8 +279,7 @@ class Language(object): | ||||||
|     def make_doc(self, text): |     def make_doc(self, text): | ||||||
|         return self.tokenizer(text) |         return self.tokenizer(text) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None, |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|             update_shared=False): |  | ||||||
|         """Update the models in the pipeline. |         """Update the models in the pipeline. | ||||||
| 
 | 
 | ||||||
|         docs (iterable): A batch of `Doc` objects. |         docs (iterable): A batch of `Doc` objects. | ||||||
|  | @ -303,32 +303,17 @@ class Language(object): | ||||||
|             if self._optimizer is None: |             if self._optimizer is None: | ||||||
|                 self._optimizer = Adam(Model.ops, 0.001) |                 self._optimizer = Adam(Model.ops, 0.001) | ||||||
|             sgd = self._optimizer |             sgd = self._optimizer | ||||||
|         tok2vec = self.pipeline[0] |  | ||||||
|         feats = tok2vec.doc2feats(docs) |  | ||||||
|         grads = {} |         grads = {} | ||||||
|         def get_grads(W, dW, key=None): |         def get_grads(W, dW, key=None): | ||||||
|             grads[key] = (W, dW) |             grads[key] = (W, dW) | ||||||
|         pipes = list(self.pipeline[1:]) |         pipes = list(self.pipeline) | ||||||
|         random.shuffle(pipes) |         random.shuffle(pipes) | ||||||
|         tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) |  | ||||||
|         all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] |  | ||||||
|         for proc in pipes: |         for proc in pipes: | ||||||
|             if not hasattr(proc, 'update'): |             if not hasattr(proc, 'update'): | ||||||
|                 continue |                 continue | ||||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, |             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||||
|                                       drop=drop, sgd=get_grads, losses=losses) |  | ||||||
|             if update_shared and d_tokvecses is not None: |  | ||||||
|                 for i, d_tv in enumerate(d_tokvecses): |  | ||||||
|                     all_d_tokvecses[i] += d_tv |  | ||||||
|         if update_shared and bp_tokvecses is not None: |  | ||||||
|             bp_tokvecses(all_d_tokvecses, sgd=sgd) |  | ||||||
|         for key, (W, dW) in grads.items(): |         for key, (W, dW) in grads.items(): | ||||||
|             sgd(W, dW, key=key) |             sgd(W, dW, key=key) | ||||||
|         # Clear the tensor variable, to free GPU memory. |  | ||||||
|         # If we don't do this, the memory leak gets pretty |  | ||||||
|         # bad, because we may be holding part of a batch. |  | ||||||
|         for doc in docs: |  | ||||||
|             doc.tensor = None |  | ||||||
| 
 | 
 | ||||||
|     def preprocess_gold(self, docs_golds): |     def preprocess_gold(self, docs_golds): | ||||||
|         """Can be called before training to pre-process gold data. By default, |         """Can be called before training to pre-process gold data. By default, | ||||||
|  | @ -343,36 +328,49 @@ class Language(object): | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|             yield doc, gold |             yield doc, gold | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, get_gold_tuples, **cfg): |     def resume_training(self, **cfg): | ||||||
|  |         if cfg.get('device', -1) >= 0: | ||||||
|  |             device = util.use_gpu(cfg['device']) | ||||||
|  |             if self.vocab.vectors.data.shape[1] >= 1: | ||||||
|  |                 self.vocab.vectors.data = Model.ops.asarray( | ||||||
|  |                     self.vocab.vectors.data) | ||||||
|  |         else: | ||||||
|  |             device = None | ||||||
|  |         learn_rate = util.env_opt('learn_rate', 0.001) | ||||||
|  |         beta1 = util.env_opt('optimizer_B1', 0.9) | ||||||
|  |         beta2 = util.env_opt('optimizer_B2', 0.999) | ||||||
|  |         eps = util.env_opt('optimizer_eps', 1e-08) | ||||||
|  |         L2 = util.env_opt('L2_penalty', 1e-6) | ||||||
|  |         max_grad_norm = util.env_opt('grad_norm_clip', 1.) | ||||||
|  |         self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, | ||||||
|  |                               beta2=beta2, eps=eps) | ||||||
|  |         self._optimizer.max_grad_norm = max_grad_norm | ||||||
|  |         self._optimizer.device = device | ||||||
|  |         return self._optimizer | ||||||
|  | 
 | ||||||
|  |     def begin_training(self, get_gold_tuples=None, **cfg): | ||||||
|         """Allocate models, pre-process training data and acquire a trainer and |         """Allocate models, pre-process training data and acquire a trainer and | ||||||
|         optimizer. Used as a contextmanager. |         optimizer. Used as a contextmanager. | ||||||
| 
 | 
 | ||||||
|         gold_tuples (iterable): Gold-standard training data. |         get_gold_tuples (function): Function returning gold data | ||||||
|         **cfg: Config parameters. |         **cfg: Config parameters. | ||||||
|         YIELDS (tuple): A trainer and an optimizer. |         RETURNS: An optimizer. | ||||||
| 
 |  | ||||||
|         EXAMPLE: |  | ||||||
|             >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): |  | ||||||
|             >>>    for epoch in trainer.epochs(gold): |  | ||||||
|             >>>        for docs, golds in epoch: |  | ||||||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) |  | ||||||
|         """ |         """ | ||||||
|         if self.parser: |  | ||||||
|             self.pipeline.append(NeuralLabeller(self.vocab)) |  | ||||||
|         # Populate vocab |         # Populate vocab | ||||||
|         for _, annots_brackets in get_gold_tuples(): |         if get_gold_tuples is not None: | ||||||
|             for annots, _ in annots_brackets: |             for _, annots_brackets in get_gold_tuples(): | ||||||
|                 for word in annots[1]: |                 for annots, _ in annots_brackets: | ||||||
|                     _ = self.vocab[word] |                     for word in annots[1]: | ||||||
|  |                         _ = self.vocab[word] | ||||||
|         contexts = [] |         contexts = [] | ||||||
|         if cfg.get('device', -1) >= 0: |         if cfg.get('device', -1) >= 0: | ||||||
|             import cupy.cuda.device |             device = util.use_gpu(cfg['device']) | ||||||
|             device = cupy.cuda.device.Device(cfg['device']) |             if self.vocab.vectors.data.shape[1] >= 1: | ||||||
|             device.use() |                 self.vocab.vectors.data = Model.ops.asarray( | ||||||
|             Model.ops = CupyOps() |                     self.vocab.vectors.data) | ||||||
|             Model.Ops = CupyOps |  | ||||||
|         else: |         else: | ||||||
|             device = None |             device = None | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
|         for proc in self.pipeline: |         for proc in self.pipeline: | ||||||
|             if hasattr(proc, 'begin_training'): |             if hasattr(proc, 'begin_training'): | ||||||
|                 context = proc.begin_training(get_gold_tuples(), |                 context = proc.begin_training(get_gold_tuples(), | ||||||
|  | @ -390,7 +388,7 @@ class Language(object): | ||||||
|         self._optimizer.device = device |         self._optimizer.device = device | ||||||
|         return self._optimizer |         return self._optimizer | ||||||
| 
 | 
 | ||||||
|     def evaluate(self, docs_golds): |     def evaluate(self, docs_golds, verbose=False): | ||||||
|         scorer = Scorer() |         scorer = Scorer() | ||||||
|         docs, golds = zip(*docs_golds) |         docs, golds = zip(*docs_golds) | ||||||
|         docs = list(docs) |         docs = list(docs) | ||||||
|  | @ -403,8 +401,9 @@ class Language(object): | ||||||
|                 docs = list(pipe.pipe(docs)) |                 docs = list(pipe.pipe(docs)) | ||||||
|         assert len(docs) == len(golds) |         assert len(docs) == len(golds) | ||||||
|         for doc, gold in zip(docs, golds): |         for doc, gold in zip(docs, golds): | ||||||
|             scorer.score(doc, gold) |             if verbose: | ||||||
|             doc.tensor = None |                 print(doc) | ||||||
|  |             scorer.score(doc, gold, verbose=verbose) | ||||||
|         return scorer |         return scorer | ||||||
| 
 | 
 | ||||||
|     @contextmanager |     @contextmanager | ||||||
|  | @ -493,7 +492,6 @@ class Language(object): | ||||||
|         """ |         """ | ||||||
|         path = util.ensure_path(path) |         path = util.ensure_path(path) | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), |  | ||||||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|  | @ -505,6 +503,7 @@ class Language(object): | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) |             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||||
|  |         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||||
|         util.to_disk(path, serializers, {p: False for p in disable}) |         util.to_disk(path, serializers, {p: False for p in disable}) | ||||||
| 
 | 
 | ||||||
|     def from_disk(self, path, disable=tuple()): |     def from_disk(self, path, disable=tuple()): | ||||||
|  |  | ||||||
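
Taken together, the language.py hunks replace the trainer context manager with an optimizer returned by begin_training(), drop the shared tok2vec bookkeeping from update(), and add resume_training() plus a verbose flag on evaluate(). A rough sketch of the resulting loop; nlp, get_gold_tuples, train_batches and dev_data are placeholders, not names from this diff:

    import random

    optimizer = nlp.begin_training(get_gold_tuples)   # returns an Adam optimizer
    for epoch in range(10):
        random.shuffle(train_batches)                 # list of (docs, golds) batches
        losses = {}
        for docs, golds in train_batches:
            # update() now iterates over the whole pipeline; there is no
            # update_shared argument and no manual tensor clean-up.
            nlp.update(docs, golds, drop=0.2, sgd=optimizer, losses=losses)
        scorer = nlp.evaluate(dev_data, verbose=False)
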
|  | @ -38,7 +38,8 @@ class Lemmatizer(object): | ||||||
|         avoid lemmatization entirely. |         avoid lemmatization entirely. | ||||||
|         """ |         """ | ||||||
|         morphology = {} if morphology is None else morphology |         morphology = {} if morphology is None else morphology | ||||||
|         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] |         others = [key for key in morphology | ||||||
|  |                   if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] | ||||||
|         true_morph_key = morphology.get('morph', 0) |         true_morph_key = morphology.get('morph', 0) | ||||||
|         if univ_pos == 'noun' and morphology.get('Number') == 'sing': |         if univ_pos == 'noun' and morphology.get('Number') == 'sing': | ||||||
|             return True |             return True | ||||||
|  | @ -47,7 +48,9 @@ class Lemmatizer(object): | ||||||
|         # This maps 'VBP' to base form -- probably just need 'IS_BASE' |         # This maps 'VBP' to base form -- probably just need 'IS_BASE' | ||||||
|         # morphology |         # morphology | ||||||
|         elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ |         elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ | ||||||
|                                      morphology.get('Tense') == 'pres'): |                                      morphology.get('Tense') == 'pres' and \ | ||||||
|  |                                      morphology.get('Number') is None and \ | ||||||
|  |                                      not others): | ||||||
|             return True |             return True | ||||||
|         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': |         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': | ||||||
|             return True |             return True | ||||||
|  |  | ||||||
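
The lemmatizer hunk tightens the base-form shortcut for verbs: besides VerbForm=fin and Tense=pres, the morphology must now carry no Number feature and no other features at all. A hedged illustration, assuming the surrounding method is the lemmatizer's is_base_form() check and using hypothetical inputs not taken from this diff:

    lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres'})
    # -> True: a finite present form with no extra features is left alone
    lemmatizer.is_base_form('verb', {'VerbForm': 'fin', 'Tense': 'pres', 'Number': 'sing'})
    # -> now False: the Number feature forces a proper lemma lookup
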
|  | @ -421,47 +421,69 @@ cdef class PhraseMatcher: | ||||||
|     cdef int max_length |     cdef int max_length | ||||||
|     cdef attr_t* _phrase_key |     cdef attr_t* _phrase_key | ||||||
| 
 | 
 | ||||||
|     def __init__(self, Vocab vocab, phrases, max_length=10): |     cdef public object _callbacks | ||||||
|  |     cdef public object _patterns | ||||||
|  | 
 | ||||||
|  |     def __init__(self, Vocab vocab, max_length=10): | ||||||
|         self.mem = Pool() |         self.mem = Pool() | ||||||
|         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) |         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) | ||||||
|         self.max_length = max_length |         self.max_length = max_length | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.matcher = Matcher(self.vocab, {}) |         self.matcher = Matcher(self.vocab) | ||||||
|         self.phrase_ids = PreshMap() |         self.phrase_ids = PreshMap() | ||||||
|         for phrase in phrases: |  | ||||||
|             if len(phrase) < max_length: |  | ||||||
|                 self.add(phrase) |  | ||||||
| 
 |  | ||||||
|         abstract_patterns = [] |         abstract_patterns = [] | ||||||
|         for length in range(1, max_length): |         for length in range(1, max_length): | ||||||
|             abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) |             abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) | ||||||
|         self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match) |         self.matcher.add('Candidate', None, *abstract_patterns) | ||||||
|  |         self._callbacks = {} | ||||||
| 
 | 
 | ||||||
|     def add(self, Doc tokens): |     def __len__(self): | ||||||
|         cdef int length = tokens.length |         raise NotImplementedError | ||||||
|         assert length < self.max_length |  | ||||||
|         tags = get_bilou(length) |  | ||||||
|         assert len(tags) == length, length |  | ||||||
| 
 | 
 | ||||||
|  |     def __contains__(self, key): | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|  |     def __reduce__(self): | ||||||
|  |         return (self.__class__, (self.vocab,), None, None) | ||||||
|  | 
 | ||||||
|  |     def add(self, key, on_match, *docs): | ||||||
|  |         cdef Doc doc | ||||||
|  |         for doc in docs: | ||||||
|  |             if len(doc) >= self.max_length: | ||||||
|  |                 msg = ( | ||||||
|  |                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " | ||||||
|  |                     "Length can be set on initialization, up to 10." | ||||||
|  |                 ) | ||||||
|  |                 raise ValueError(msg % (len(doc), self.max_length)) | ||||||
|  |         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||||
|  |         self._callbacks[ent_id] = on_match | ||||||
|  | 
 | ||||||
|  |         cdef int length | ||||||
|         cdef int i |         cdef int i | ||||||
|         for i in range(self.max_length): |         cdef hash_t phrase_hash | ||||||
|             self._phrase_key[i] = 0 |         for doc in docs: | ||||||
|         for i, tag in enumerate(tags): |             length = doc.length | ||||||
|             lexeme = self.vocab[tokens.c[i].lex.orth] |             tags = get_bilou(length) | ||||||
|             lexeme.set_flag(tag, True) |             for i in range(self.max_length): | ||||||
|             self._phrase_key[i] = lexeme.orth |                 self._phrase_key[i] = 0 | ||||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) |             for i, tag in enumerate(tags): | ||||||
|         self.phrase_ids[key] = True |                 lexeme = self.vocab[doc.c[i].lex.orth] | ||||||
|  |                 lexeme.set_flag(tag, True) | ||||||
|  |                 self._phrase_key[i] = lexeme.orth | ||||||
|  |             phrase_hash = hash64(self._phrase_key, | ||||||
|  |                                  self.max_length * sizeof(attr_t), 0) | ||||||
|  |             self.phrase_ids.set(phrase_hash, <void*>ent_id) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, Doc doc): |     def __call__(self, Doc doc): | ||||||
|         matches = [] |         matches = [] | ||||||
|         for ent_id, label, start, end in self.matcher(doc): |         for _, start, end in self.matcher(doc): | ||||||
|             cand = doc[start : end] |             ent_id = self.accept_match(doc, start, end) | ||||||
|             start = cand[0].idx |             if ent_id is not None: | ||||||
|             end = cand[-1].idx + len(cand[-1]) |                 matches.append((ent_id, start, end)) | ||||||
|             matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) |         for i, (ent_id, start, end) in enumerate(matches): | ||||||
|         for match in matches: |             on_match = self._callbacks.get(ent_id) | ||||||
|             doc.merge(*match) |             if on_match is not None: | ||||||
|  |                 on_match(self, doc, i, matches) | ||||||
|         return matches |         return matches | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=1000, n_threads=2): |     def pipe(self, stream, batch_size=1000, n_threads=2): | ||||||
|  | @ -469,7 +491,7 @@ cdef class PhraseMatcher: | ||||||
|             self(doc) |             self(doc) | ||||||
|             yield doc |             yield doc | ||||||
| 
 | 
 | ||||||
|     def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end): |     def accept_match(self, Doc doc, int start, int end): | ||||||
|         assert (end - start) < self.max_length |         assert (end - start) < self.max_length | ||||||
|         cdef int i, j |         cdef int i, j | ||||||
|         for i in range(self.max_length): |         for i in range(self.max_length): | ||||||
|  | @ -477,7 +499,8 @@ cdef class PhraseMatcher: | ||||||
|         for i, j in enumerate(range(start, end)): |         for i, j in enumerate(range(start, end)): | ||||||
|             self._phrase_key[i] = doc.c[j].lex.orth |             self._phrase_key[i] = doc.c[j].lex.orth | ||||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) |         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) | ||||||
|         if self.phrase_ids.get(key): |         ent_id = <hash_t>self.phrase_ids.get(key) | ||||||
|             return (ent_id, label, start, end) |         if ent_id == 0: | ||||||
|  |             return None | ||||||
|         else: |         else: | ||||||
|             return False |             return ent_id | ||||||
|  |  | ||||||
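
The PhraseMatcher now mirrors the Matcher API: patterns are Doc objects added under a key with an optional on_match callback, and calling the matcher returns (match_id, start, end) tuples instead of merging spans in place. A usage sketch, assuming nlp is an already loaded pipeline:

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(text) for text in (u'Barack Obama', u'Angela Merkel')]
    matcher.add('PEOPLE', None, *patterns)        # key, on_match callback, *docs

    doc = nlp(u'Barack Obama met Angela Merkel in Berlin.')
    for match_id, start, end in matcher(doc):
        # match_id is the hash of the key passed to add()
        print(nlp.vocab.strings[match_id], doc[start:end].text)
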
|  | @ -146,6 +146,8 @@ cdef class Morphology: | ||||||
|                 self.add_special_case(tag_str, form_str, attrs) |                 self.add_special_case(tag_str, form_str, attrs) | ||||||
| 
 | 
 | ||||||
|     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): |     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): | ||||||
|  |         if orth not in self.strings: | ||||||
|  |             return orth | ||||||
|         cdef unicode py_string = self.strings[orth] |         cdef unicode py_string = self.strings[orth] | ||||||
|         if self.lemmatizer is None: |         if self.lemmatizer is None: | ||||||
|             return self.strings.add(py_string.lower()) |             return self.strings.add(py_string.lower()) | ||||||
|  |  | ||||||
|  | @ -4,7 +4,6 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from thinc.api import chain, layerize, with_getitem | from thinc.api import chain, layerize, with_getitem | ||||||
| from thinc.neural import Model, Softmax |  | ||||||
| import numpy | import numpy | ||||||
| cimport numpy as np | cimport numpy as np | ||||||
| import cytoolz | import cytoolz | ||||||
|  | @ -14,17 +13,18 @@ import ujson | ||||||
| import msgpack | import msgpack | ||||||
| 
 | 
 | ||||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||||
| from thinc.neural import Model, Maxout, Softmax, Affine | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
| from thinc.neural._classes.hash_embed import HashEmbed | from thinc.i2v import HashEmbed | ||||||
|  | from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||||
|  | from thinc.t2t import ExtractWindow, ParametricAttention | ||||||
|  | from thinc.misc import Residual | ||||||
|  | from thinc.misc import BatchNorm as BN | ||||||
|  | from thinc.misc import LayerNorm as LN | ||||||
|  | 
 | ||||||
| from thinc.neural.util import to_categorical | from thinc.neural.util import to_categorical | ||||||
| 
 | 
 | ||||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool |  | ||||||
| from thinc.neural._classes.difference import Siamese, CauchySimilarity | from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||||
| 
 | 
 | ||||||
| from thinc.neural._classes.convolution import ExtractWindow |  | ||||||
| from thinc.neural._classes.resnet import Residual |  | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN |  | ||||||
| 
 |  | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .syntax.parser cimport Parser as LinearParser | from .syntax.parser cimport Parser as LinearParser | ||||||
| from .syntax.nn_parser cimport Parser as NeuralParser | from .syntax.nn_parser cimport Parser as NeuralParser | ||||||
|  | @ -41,13 +41,14 @@ from .syntax import nonproj | ||||||
| from .compat import json_dumps | from .compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | from ._ml import rebatch, Tok2Vec, flatten | ||||||
| from ._ml import build_text_classifier, build_tagger_model | from ._ml import build_text_classifier, build_tagger_model | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| from .parts_of_speech import X | from .parts_of_speech import X | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SentenceSegmenter(object): | class SentenceSegmenter(object): | ||||||
|     '''A simple spaCy hook, to allow custom sentence boundary detection logic |     """A simple spaCy hook, to allow custom sentence boundary detection logic | ||||||
|     (that doesn't require the dependency parse). |     (that doesn't require the dependency parse). | ||||||
| 
 | 
 | ||||||
|     To change the sentence boundary detection strategy, pass a generator |     To change the sentence boundary detection strategy, pass a generator | ||||||
|  | @ -56,7 +57,7 @@ class SentenceSegmenter(object): | ||||||
| 
 | 
 | ||||||
|     Sentence detection strategies should be generators that take `Doc` objects |     Sentence detection strategies should be generators that take `Doc` objects | ||||||
|     and yield `Span` objects for each sentence. |     and yield `Span` objects for each sentence. | ||||||
|     ''' |     """ | ||||||
|     name = 'sbd' |     name = 'sbd' | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, strategy=None): |     def __init__(self, vocab, strategy=None): | ||||||
|  | @ -88,17 +89,30 @@ class BaseThincComponent(object): | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, *shape, **kwargs): |     def Model(cls, *shape, **kwargs): | ||||||
|  |         """Initialize a model for the pipe.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, **cfg): | ||||||
|  |         """Create a new pipe instance.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|  |         """Apply the pipe to one document. The document is | ||||||
|  |         modified in-place, and returned. | ||||||
|  | 
 | ||||||
|  |         Both __call__ and pipe should delegate to the `predict()` | ||||||
|  |         and `set_annotations()` methods. | ||||||
|  |         """ | ||||||
|         scores = self.predict([doc]) |         scores = self.predict([doc]) | ||||||
|         self.set_annotations([doc], scores) |         self.set_annotations([doc], scores) | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): |     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||||
|  |         """Apply the pipe to a stream of documents. | ||||||
|  | 
 | ||||||
|  |         Both __call__ and pipe should delegate to the `predict()` | ||||||
|  |         and `set_annotations()` methods. | ||||||
|  |         """ | ||||||
|         for docs in cytoolz.partition_all(batch_size, stream): |         for docs in cytoolz.partition_all(batch_size, stream): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             scores = self.predict(docs) |             scores = self.predict(docs) | ||||||
|  | @ -106,27 +120,43 @@ class BaseThincComponent(object): | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def predict(self, docs): |     def predict(self, docs): | ||||||
|  |         """Apply the pipeline's model to a batch of docs, without | ||||||
|  |         modifying them. | ||||||
|  |         """ | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, scores): |     def set_annotations(self, docs, scores): | ||||||
|  |         """Modify a batch of documents, using pre-computed scores.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|  |         """Learn from a batch of documents and gold-standard information, | ||||||
|  |         updating the pipe's model. | ||||||
|  | 
 | ||||||
|  |         Delegates to predict() and get_loss(). | ||||||
|  |         """ | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|  |         """Find the loss and gradient of loss for the batch of | ||||||
|  |         documents and their predicted scores.""" | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||||
|         token_vector_width = pipeline[0].model.nO |         """Initialize the pipe for training, using data examples if available. | ||||||
|  |         If no model has been initialized yet, the model is added.""" | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(1, token_vector_width) |             self.model = self.Model(**self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|  |         """Modify the pipe's model, to use the given parameter values. | ||||||
|  |         """ | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|             yield |             yield | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|  |         """Serialize the pipe to a bytestring.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('cfg', lambda: json_dumps(self.cfg)), |             ('cfg', lambda: json_dumps(self.cfg)), | ||||||
|             ('model', lambda: self.model.to_bytes()), |             ('model', lambda: self.model.to_bytes()), | ||||||
|  | @ -135,37 +165,42 @@ class BaseThincComponent(object): | ||||||
|         return util.to_bytes(serialize, exclude) |         return util.to_bytes(serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes_data, **exclude): |     def from_bytes(self, bytes_data, **exclude): | ||||||
|  |         """Load the pipe from a bytestring.""" | ||||||
|         def load_model(b): |         def load_model(b): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), |             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||||
|  |             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||||
|             ('model', load_model), |             ('model', load_model), | ||||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)) |  | ||||||
|         )) |         )) | ||||||
|         util.from_bytes(bytes_data, deserialize, exclude) |         util.from_bytes(bytes_data, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         """Serialize the pipe to disk.""" | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), |             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), | ||||||
|  |             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), |             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)) |  | ||||||
|         )) |         )) | ||||||
|         util.to_disk(path, serialize, exclude) |         util.to_disk(path, serialize, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|  |         """Load the pipe from disk.""" | ||||||
|         def load_model(p): |         def load_model(p): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model = self.Model(**self.cfg) |                 self.model = self.Model(**self.cfg) | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), |             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||||
|             ('model', load_model), |  | ||||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), |             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||||
|  |             ('model', load_model), | ||||||
|         )) |         )) | ||||||
|         util.from_disk(path, deserialize, exclude) |         util.from_disk(path, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
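
The new docstrings spell out the contract for these pipes: __call__ and pipe delegate to predict() and set_annotations(), and deserialization now restores the vocab before the model. A minimal, hypothetical subclass sketch of the non-training half of that contract (it assumes BaseThincComponent is importable from spacy.pipeline; nothing here is part of this diff):

    from spacy.pipeline import BaseThincComponent

    class TokenCounter(BaseThincComponent):
        """Toy pipe: 'predicts' the token count and stores it on the doc."""
        name = 'token_counter'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def predict(self, docs):
            # No real model here; a statistical pipe would call self.model(docs).
            return [len(doc) for doc in docs]

        def set_annotations(self, docs, scores):
            for doc, n_tokens in zip(docs, scores):
                doc.user_data['n_tokens'] = n_tokens
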
|  | @ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         width = util.env_opt('token_vector_width', width) |         width = util.env_opt('token_vector_width', width) | ||||||
|         embed_size = util.env_opt('embed_size', embed_size) |         embed_size = util.env_opt('embed_size', embed_size) | ||||||
|         return Tok2Vec(width, embed_size, preprocess=None) |         return Tok2Vec(width, embed_size, **cfg) | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, **cfg): | ||||||
|         """Construct a new statistical model. Weights are not allocated on |         """Construct a new statistical model. Weights are not allocated on | ||||||
|  | @ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|             >>> tok2vec.model = tok2vec.Model(128, 5000) |             >>> tok2vec.model = tok2vec.Model(128, 5000) | ||||||
|         """ |         """ | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         self.model = model |         self.model = model | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 3) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM |         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM | ||||||
|  | @ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         docs (iterable): A sequence of `Doc` objects. |         docs (iterable): A sequence of `Doc` objects. | ||||||
|         RETURNS (object): Vector representations for each token in the documents. |         RETURNS (object): Vector representations for each token in the documents. | ||||||
|         """ |         """ | ||||||
|         feats = self.doc2feats(docs) |         tokvecs = self.model(docs) | ||||||
|         tokvecs = self.model(feats) |  | ||||||
|         return tokvecs |         return tokvecs | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, tokvecses): |     def set_annotations(self, docs, tokvecses): | ||||||
|  | @ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         if isinstance(docs, Doc): |         if isinstance(docs, Doc): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|         feats = self.doc2feats(docs) |         tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop) | ||||||
|         tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) |  | ||||||
|         return tokvecs, bp_tokvecs |         return tokvecs, bp_tokvecs | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|  | @ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent): | ||||||
|         gold_tuples (iterable): Gold-standard training data. |         gold_tuples (iterable): Gold-standard training data. | ||||||
|         pipeline (list): The pipeline the model is part of. |         pipeline (list): The pipeline the model is part of. | ||||||
|         """ |         """ | ||||||
|         self.doc2feats = doc2feats() |  | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model() |             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|  |             self.model = self.Model(**self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class NeuralTagger(BaseThincComponent): | class NeuralTagger(BaseThincComponent): | ||||||
|  | @ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent): | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.model = model |         self.model = model | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||||
|  |         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         tags = self.predict(([doc], [doc.tensor])) |         tags = self.predict([doc]) | ||||||
|         self.set_annotations([doc], tags) |         self.set_annotations([doc], tags) | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): |     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||||
|         for docs in cytoolz.partition_all(batch_size, stream): |         for docs in cytoolz.partition_all(batch_size, stream): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             tokvecs = [d.tensor for d in docs] |             tag_ids = self.predict(docs) | ||||||
|             tag_ids = self.predict((docs, tokvecs)) |  | ||||||
|             self.set_annotations(docs, tag_ids) |             self.set_annotations(docs, tag_ids) | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def predict(self, docs_tokvecs): |     def predict(self, docs): | ||||||
|         scores = self.model(docs_tokvecs) |         scores = self.model(docs) | ||||||
|         scores = self.model.ops.flatten(scores) |         scores = self.model.ops.flatten(scores) | ||||||
|         guesses = scores.argmax(axis=1) |         guesses = scores.argmax(axis=1) | ||||||
|         if not isinstance(guesses, numpy.ndarray): |         if not isinstance(guesses, numpy.ndarray): | ||||||
|             guesses = guesses.get() |             guesses = guesses.get() | ||||||
|         tokvecs = docs_tokvecs[1] |  | ||||||
|         guesses = self.model.ops.unflatten(guesses, |         guesses = self.model.ops.unflatten(guesses, | ||||||
|                     [tv.shape[0] for tv in tokvecs]) |                     [len(d) for d in docs]) | ||||||
|         return guesses |         return guesses | ||||||
| 
 | 
 | ||||||
|     def set_annotations(self, docs, batch_tag_ids): |     def set_annotations(self, docs, batch_tag_ids): | ||||||
|  | @ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent): | ||||||
|                 idx += 1 |                 idx += 1 | ||||||
|         doc.is_tagged = True |         doc.is_tagged = True | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvecs = docs_tokvecs |  | ||||||
| 
 | 
 | ||||||
|         if self.model.nI is None: |         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) | ||||||
|             self.model.nI = tokvecs[0].shape[1] |  | ||||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) |  | ||||||
|         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) |         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) | ||||||
|  |         bp_tag_scores(d_tag_scores, sgd=sgd) | ||||||
| 
 | 
 | ||||||
|         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) |  | ||||||
|         if losses is not None: |         if losses is not None: | ||||||
|             losses[self.name] += loss |             losses[self.name] += loss | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         scores = self.model.ops.flatten(scores) |         scores = self.model.ops.flatten(scores) | ||||||
|  | @ -392,13 +423,14 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             vocab.morphology = Morphology(vocab.strings, new_tag_map, |             vocab.morphology = Morphology(vocab.strings, new_tag_map, | ||||||
|                                           vocab.morphology.lemmatizer, |                                           vocab.morphology.lemmatizer, | ||||||
|                                           exc=vocab.morphology.exc) |                                           exc=vocab.morphology.exc) | ||||||
|         token_vector_width = pipeline[0].model.nO |  | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, **cfg): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, **cfg) | ||||||
| 
 | 
 | ||||||
|     def use_params(self, params): |     def use_params(self, params): | ||||||
|         with self.model.use_params(params): |         with self.model.use_params(params): | ||||||
|  | @ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 token_vector_width = util.env_opt('token_vector_width', | ||||||
|                         self.cfg.get('token_vector_width', 128)) |                         self.cfg.get('token_vector_width', 128)) | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(b): |         def load_tag_map(b): | ||||||
|  | @ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|         serialize = OrderedDict(( |         serialize = OrderedDict(( | ||||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), |             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||||
|             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( |             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( | ||||||
|  | @ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent): | ||||||
|     def from_disk(self, path, **exclude): |     def from_disk(self, path, **exclude): | ||||||
|         def load_model(p): |         def load_model(p): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt('token_vector_width', |                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||||
|                         self.cfg.get('token_vector_width', 128)) |  | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) |  | ||||||
|             self.model.from_bytes(p.open('rb').read()) |             self.model.from_bytes(p.open('rb').read()) | ||||||
| 
 | 
 | ||||||
|         def load_tag_map(p): |         def load_tag_map(p): | ||||||
|  | @ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent): | ||||||
|                 exc=self.vocab.morphology.exc) |                 exc=self.vocab.morphology.exc) | ||||||
| 
 | 
 | ||||||
|         deserialize = OrderedDict(( |         deserialize = OrderedDict(( | ||||||
|  |             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), |             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||||
|             ('tag_map', load_tag_map), |             ('tag_map', load_tag_map), | ||||||
|             ('model', load_model), |             ('model', load_model), | ||||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))) |  | ||||||
|         )) |         )) | ||||||
|         util.from_disk(path, deserialize, exclude) |         util.from_disk(path, deserialize, exclude) | ||||||
|         return self |         return self | ||||||
|  | @ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent): | ||||||
| 
 | 
 | ||||||
| class NeuralLabeller(NeuralTagger): | class NeuralLabeller(NeuralTagger): | ||||||
|     name = 'nn_labeller' |     name = 'nn_labeller' | ||||||
|     def __init__(self, vocab, model=True, **cfg): |     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.model = model |         self.model = model | ||||||
|  |         if target == 'dep': | ||||||
|  |             self.make_label = self.make_dep | ||||||
|  |         elif target == 'tag': | ||||||
|  |             self.make_label = self.make_tag | ||||||
|  |         elif target == 'ent': | ||||||
|  |             self.make_label = self.make_ent | ||||||
|  |         elif target == 'dep_tag_offset': | ||||||
|  |             self.make_label = self.make_dep_tag_offset | ||||||
|  |         elif target == 'ent_tag': | ||||||
|  |             self.make_label = self.make_ent_tag | ||||||
|  |         elif hasattr(target, '__call__'): | ||||||
|  |             self.make_label = target | ||||||
|  |         else: | ||||||
|  |             raise ValueError( | ||||||
|  |                 "NeuralLabeller target should be a function or one of " | ||||||
|  |                 "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|  |         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||||
|  |         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def labels(self): |     def labels(self): | ||||||
|  | @ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger): | ||||||
|     def set_annotations(self, docs, dep_ids): |     def set_annotations(self, docs, dep_ids): | ||||||
|         pass |         pass | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): |     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None): | ||||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) |         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||||
|         for raw_text, annots_brackets in gold_tuples: |         for raw_text, annots_brackets in gold_tuples: | ||||||
|             for annots, brackets in annots_brackets: |             for annots, brackets in annots_brackets: | ||||||
|                 ids, words, tags, heads, deps, ents = annots |                 ids, words, tags, heads, deps, ents = annots | ||||||
|                 for dep in deps: |                 for i in range(len(ids)): | ||||||
|                     if dep not in self.labels: |                     label = self.make_label(i, words, tags, heads, deps, ents) | ||||||
|                         self.labels[dep] = len(self.labels) |                     if label is not None and label not in self.labels: | ||||||
|         token_vector_width = pipeline[0].model.nO |                         self.labels[label] = len(self.labels) | ||||||
|  |         # print(len(self.labels))  # stray debug output | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width) |             token_vector_width = util.env_opt('token_vector_width') | ||||||
|  |             self.model = chain( | ||||||
|  |                 tok2vec, | ||||||
|  |                 Softmax(len(self.labels), token_vector_width) | ||||||
|  |             ) | ||||||
|  |         link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, token_vector_width): |     def Model(cls, n_tags, tok2vec=None, **cfg): | ||||||
|         return build_tagger_model(n_tags, token_vector_width) |         return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg) | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         scores = self.model.ops.flatten(scores) |  | ||||||
|         cdef int idx = 0 |         cdef int idx = 0 | ||||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') |         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||||
|         guesses = scores.argmax(axis=1) |         guesses = scores.argmax(axis=1) | ||||||
|         for gold in golds: |         for gold in golds: | ||||||
|             for tag in gold.labels: |             for i in range(len(gold.labels)): | ||||||
|                 if tag is None or tag not in self.labels: |                 label = self.make_label(i, gold.words, gold.tags, gold.heads, | ||||||
|  |                                         gold.labels, gold.ents) | ||||||
|  |                 if label is None or label not in self.labels: | ||||||
|                     correct[idx] = guesses[idx] |                     correct[idx] = guesses[idx] | ||||||
|                 else: |                 else: | ||||||
|                     correct[idx] = self.labels[tag] |                     correct[idx] = self.labels[label] | ||||||
|                 idx += 1 |                 idx += 1 | ||||||
|         correct = self.model.ops.xp.array(correct, dtype='i') |         correct = self.model.ops.xp.array(correct, dtype='i') | ||||||
|         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) |         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) | ||||||
|         d_scores /= d_scores.shape[0] |         d_scores /= d_scores.shape[0] | ||||||
|         loss = (d_scores**2).sum() |         loss = (d_scores**2).sum() | ||||||
|         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) |  | ||||||
|         return float(loss), d_scores |         return float(loss), d_scores | ||||||
| 
 | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_dep(i, words, tags, heads, deps, ents): | ||||||
|  |         if deps[i] is None or heads[i] is None: | ||||||
|  |             return None | ||||||
|  |         return deps[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_tag(i, words, tags, heads, deps, ents): | ||||||
|  |         return tags[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_ent(i, words, tags, heads, deps, ents): | ||||||
|  |         if ents is None: | ||||||
|  |             return None | ||||||
|  |         return ents[i] | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_dep_tag_offset(i, words, tags, heads, deps, ents): | ||||||
|  |         if deps[i] is None or heads[i] is None: | ||||||
|  |             return None | ||||||
|  |         offset = heads[i] - i | ||||||
|  |         offset = min(offset, 2) | ||||||
|  |         offset = max(offset, -2) | ||||||
|  |         return '%s-%s:%d' % (deps[i], tags[i], offset) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def make_ent_tag(i, words, tags, heads, deps, ents): | ||||||
|  |         if ents is None or ents[i] is None: | ||||||
|  |             return None | ||||||
|  |         else: | ||||||
|  |             return '%s-%s' % (tags[i], ents[i]) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
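Editor's note on the hunk above: each make_* helper maps one token's gold annotations to an auxiliary label string, or None when the annotation is missing, and begin_training collects the distinct strings into self.labels. A minimal standalone sketch of the 'dep_tag_offset' scheme; the toy sentence and its tags, heads and deps below are invented for illustration:

def make_dep_tag_offset(i, words, tags, heads, deps, ents):
    # combine dependency label, fine-grained tag and a head offset clipped to [-2, 2]
    if deps[i] is None or heads[i] is None:
        return None
    offset = max(min(heads[i] - i, 2), -2)
    return '%s-%s:%d' % (deps[i], tags[i], offset)

words = ['She', 'ate', 'pizza']
tags = ['PRP', 'VBD', 'NN']
heads = [1, 1, 1]                      # every token attaches to 'ate'
deps = ['nsubj', 'ROOT', 'dobj']
print([make_dep_tag_offset(i, words, tags, heads, deps, None) for i in range(3)])
# ['nsubj-PRP:1', 'ROOT-VBD:0', 'dobj-NN:-1']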
| class SimilarityHook(BaseThincComponent): | class SimilarityHook(BaseThincComponent): | ||||||
|     """ |     """ | ||||||
|  | @ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent): | ||||||
|         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) |         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, doc): |     def __call__(self, doc): | ||||||
|         '''Install similarity hook''' |         """Install similarity hook""" | ||||||
|         doc.user_hooks['similarity'] = self.predict |         doc.user_hooks['similarity'] = self.predict | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
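For context on the hunk above: installing self.predict under doc.user_hooks['similarity'] makes Doc.similarity defer to the hook. A small sketch of the same mechanism with a dummy scoring function; nlp here is assumed to be any loaded spaCy pipeline, and the scorer is a toy stand-in, not the Siamese model used by SimilarityHook:

def overlap_similarity(doc1, doc2):
    # toy score: shared lowercased tokens over total tokens
    a = set(t.lower_ for t in doc1)
    b = set(t.lower_ for t in doc2)
    return 2.0 * len(a & b) / (len(doc1) + len(doc2))

doc1 = nlp(u'I like pizza')              # nlp: placeholder for a loaded pipeline
doc2 = nlp(u'I like pasta')
doc1.user_hooks['similarity'] = overlap_similarity
print(doc1.similarity(doc2))             # now routed through the hook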
|  | @ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent): | ||||||
|             yield self(doc) |             yield self(doc) | ||||||
| 
 | 
 | ||||||
|     def predict(self, doc1, doc2): |     def predict(self, doc1, doc2): | ||||||
|         return self.model.predict([(doc1.tensor, doc2.tensor)]) |         return self.model.predict([(doc1, doc2)]) | ||||||
| 
 | 
 | ||||||
|     def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.): |     def update(self, doc1_doc2, golds, sgd=None, drop=0.): | ||||||
|         doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2 |         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) | ||||||
|         sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s), |  | ||||||
|                                                 drop=drop) |  | ||||||
|         d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd) |  | ||||||
| 
 |  | ||||||
|         return d_tensor1s, d_tensor2s |  | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, _=tuple(), pipeline=None): |     def begin_training(self, _=tuple(), pipeline=None): | ||||||
|         """ |         """ | ||||||
|  | @ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent): | ||||||
|         """ |         """ | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|             self.model = self.Model(pipeline[0].model.nO) |             self.model = self.Model(pipeline[0].model.nO) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class TextCategorizer(BaseThincComponent): | class TextCategorizer(BaseThincComponent): | ||||||
|  | @ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent): | ||||||
|             for j, label in enumerate(self.labels): |             for j, label in enumerate(self.labels): | ||||||
|                 doc.cats[label] = float(scores[i, j]) |                 doc.cats[label] = float(scores[i, j]) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||||
|         docs, tensors = docs_tensors |  | ||||||
|         scores, bp_scores = self.model.begin_update(docs, drop=drop) |         scores, bp_scores = self.model.begin_update(docs, drop=drop) | ||||||
|         loss, d_scores = self.get_loss(docs, golds, scores) |         loss, d_scores = self.get_loss(docs, golds, scores) | ||||||
|         d_tensors = bp_scores(d_scores, sgd=sgd) |         bp_scores(d_scores, sgd=sgd) | ||||||
|         if losses is not None: |         if losses is not None: | ||||||
|             losses.setdefault(self.name, 0.0) |             losses.setdefault(self.name, 0.0) | ||||||
|             losses[self.name] += loss |             losses[self.name] += loss | ||||||
|         return d_tensors |  | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') |         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') | ||||||
|  | @ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent): | ||||||
|         else: |         else: | ||||||
|             token_vector_width = 64 |             token_vector_width = 64 | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|  |             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             self.model = self.Model(len(self.labels), token_vector_width, |             self.model = self.Model(len(self.labels), token_vector_width, | ||||||
|                                     **self.cfg) |                                     **self.cfg) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class EntityRecognizer(LinearParser): | cdef class EntityRecognizer(LinearParser): | ||||||
|  | @ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser): | ||||||
|     name = 'parser' |     name = 'parser' | ||||||
|     TransitionSystem = ArcEager |     TransitionSystem = ArcEager | ||||||
| 
 | 
 | ||||||
|  |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|  |         for target in []: | ||||||
|  |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|  |             tok2vec = self.model[0] | ||||||
|  |             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||||
|  |             pipeline.append(labeller) | ||||||
|  |             self._multitasks.append(labeller) | ||||||
|  | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) |         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) | ||||||
| 
 | 
 | ||||||
|  | @ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser): | ||||||
| 
 | 
 | ||||||
|     nr_feature = 6 |     nr_feature = 6 | ||||||
| 
 | 
 | ||||||
|     def predict_confidences(self, docs): |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|         tensors = [d.tensor for d in docs] |         for target in []: | ||||||
|         samples = [] |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|         for i in range(10): |             tok2vec = self.model[0] | ||||||
|             states = self.parse_batch(docs, tensors, drop=0.3) |             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||||
|             for state in states: |             pipeline.append(labeller) | ||||||
|                 samples.append(self._get_entities(state)) |             self._multitasks.append(labeller) | ||||||
| 
 | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) |         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| cpdef enum symbol_t: | cdef enum symbol_t: | ||||||
|     NIL |     NIL | ||||||
|     IS_ALPHA |     IS_ALPHA | ||||||
|     IS_ASCII |     IS_ASCII | ||||||
|  |  | ||||||
|  | @ -1,4 +1,6 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
|  | #cython: optimize.unpack_method_calls=False | ||||||
|  | 
 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| IDS = { | IDS = { | ||||||
|  | @ -458,4 +460,11 @@ IDS = { | ||||||
|     "xcomp": xcomp |     "xcomp": xcomp | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] | def sort_nums(x): | ||||||
|  |     return x[1] | ||||||
|  | 
 | ||||||
|  | NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] | ||||||
|  | # Unfortunate hack here, to work around a problem with the long cpdef enum | ||||||
|  | # (which generates an enormous amount of C++ in Cython 0.24+). | ||||||
|  | # We keep the enum cdef, and just make sure the names are available to Python. | ||||||
|  | locals().update(IDS) | ||||||
|  |  | ||||||
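A quick illustration of the bookkeeping the comment above describes, using a made-up three-entry table rather than spaCy's real symbol enum:

IDS = {'NIL': 0, 'IS_ALPHA': 1, 'IS_ASCII': 2}   # stand-in for the generated enum values

def sort_nums(x):
    return x[1]

# NAMES[i] recovers the name for symbol ID i because entries are sorted by value
NAMES = [name for name, _ in sorted(IDS.items(), key=sort_nums)]
assert NAMES[IDS['IS_ASCII']] == 'IS_ASCII'

# the locals().update(IDS) line in the hunk above does the same thing at module
# import time: each symbol name becomes an integer constant visible from Python
globals().update(IDS)
assert IS_ALPHA == 1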
|  | @ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens): | ||||||
| 
 | 
 | ||||||
| nr_update = 0 | nr_update = 0 | ||||||
| def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | ||||||
|                 states, tokvecs, golds, |                 states, golds, | ||||||
|                 state2vec, vec2scores,  |                 state2vec, vec2scores,  | ||||||
|                 int width, float density, |                 int width, float density, | ||||||
|                 sgd=None, losses=None, drop=0.): |                 losses=None, drop=0.): | ||||||
|     global nr_update |     global nr_update | ||||||
|     cdef MaxViolation violn |     cdef MaxViolation violn | ||||||
|     nr_update += 1 |     nr_update += 1 | ||||||
|  |  | ||||||
|  | @ -101,9 +101,10 @@ cdef cppclass StateC: | ||||||
|         elif n == 6: |         elif n == 6: | ||||||
|             if this.B(0) >= 0: |             if this.B(0) >= 0: | ||||||
|                 ids[0] = this.B(0) |                 ids[0] = this.B(0) | ||||||
|  |                 ids[1] = this.B(0)-1 | ||||||
|             else: |             else: | ||||||
|                 ids[0] = -1 |                 ids[0] = -1 | ||||||
|             ids[1] = this.B(0) |                 ids[1] = -1 | ||||||
|             ids[2] = this.B(1) |             ids[2] = this.B(1) | ||||||
|             ids[3] = this.E(0) |             ids[3] = this.E(0) | ||||||
|             if ids[3] >= 1: |             if ids[3] >= 1: | ||||||
|  | @ -120,6 +121,8 @@ cdef cppclass StateC: | ||||||
|         for i in range(n): |         for i in range(n): | ||||||
|             if ids[i] >= 0: |             if ids[i] >= 0: | ||||||
|                 ids[i] += this.offset |                 ids[i] += this.offset | ||||||
|  |             else: | ||||||
|  |                 ids[i] = -1 | ||||||
| 
 | 
 | ||||||
|     int S(int i) nogil const: |     int S(int i) nogil const: | ||||||
|         if i >= this._s_i: |         if i >= this._s_i: | ||||||
|  | @ -162,9 +165,9 @@ cdef cppclass StateC: | ||||||
| 
 | 
 | ||||||
|     int E(int i) nogil const: |     int E(int i) nogil const: | ||||||
|         if this._e_i <= 0 or this._e_i >= this.length: |         if this._e_i <= 0 or this._e_i >= this.length: | ||||||
|             return 0 |             return -1 | ||||||
|         if i < 0 or i >= this._e_i: |         if i < 0 or i >= this._e_i: | ||||||
|             return 0 |             return -1 | ||||||
|         return this._ents[this._e_i - (i+1)].start |         return this._ents[this._e_i - (i+1)].start | ||||||
| 
 | 
 | ||||||
|     int L(int i, int idx) nogil const: |     int L(int i, int idx) nogil const: | ||||||
|  |  | ||||||
|  | @ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|     cdef Transition lookup_transition(self, object name) except *: |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|         cdef attr_t label |         cdef attr_t label | ||||||
|         if name == '-' or name == None: |         if name == '-' or name == None: | ||||||
|             move_str = 'M' |             return Transition(clas=0, move=MISSING, label=0, score=0) | ||||||
|             label = 0 |  | ||||||
|         elif name == '!O': |         elif name == '!O': | ||||||
|             return Transition(clas=0, move=ISNT, label=0, score=0) |             return Transition(clas=0, move=ISNT, label=0, score=0) | ||||||
|         elif '-' in name: |         elif '-' in name: | ||||||
|  | @ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             raise Exception(move) |             raise Exception(move) | ||||||
|         return t |         return t | ||||||
| 
 | 
 | ||||||
|  |     #def add_action(self, int action, label_name): | ||||||
|  |     #    cdef attr_t label_id | ||||||
|  |     #    if not isinstance(label_name, (int, long)): | ||||||
|  |     #        label_id = self.strings.add(label_name) | ||||||
|  |     #    else: | ||||||
|  |     #        label_id = label_name | ||||||
|  |     #    if action == OUT and label_id != 0: | ||||||
|  |     #        return | ||||||
|  |     #    if action == MISSING or action == ISNT: | ||||||
|  |     #        return | ||||||
|  |     #    # Check we're not creating a move we already have, so that this is | ||||||
|  |     #    # idempotent | ||||||
|  |     #    for trans in self.c[:self.n_moves]: | ||||||
|  |     #        if trans.move == action and trans.label == label_id: | ||||||
|  |     #            return 0 | ||||||
|  |     #    if self.n_moves >= self._size: | ||||||
|  |     #        self._size *= 2 | ||||||
|  |     #        self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||||
|  |     #    self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||||
|  |     #    assert self.c[self.n_moves].label == label_id | ||||||
|  |     #    self.n_moves += 1 | ||||||
|  |     #    return 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     cdef int initialize_state(self, StateC* st) nogil: |     cdef int initialize_state(self, StateC* st) nogil: | ||||||
|         # This is especially necessary when we use limited training data. |         # This is especially necessary when we use limited training data. | ||||||
|         for i in range(st.length): |         for i in range(st.length): | ||||||
|  |  | ||||||
|  | @ -13,6 +13,7 @@ cdef class Parser: | ||||||
|     cdef public object model |     cdef public object model | ||||||
|     cdef readonly TransitionSystem moves |     cdef readonly TransitionSystem moves | ||||||
|     cdef readonly object cfg |     cdef readonly object cfg | ||||||
|  |     cdef public object _multitasks | ||||||
| 
 | 
 | ||||||
|     cdef void _parse_step(self, StateC* state, |     cdef void _parse_step(self, StateC* state, | ||||||
|             const float* feat_weights, |             const float* feat_weights, | ||||||
|  |  | ||||||
|  | @ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function | ||||||
| 
 | 
 | ||||||
| from collections import Counter, OrderedDict | from collections import Counter, OrderedDict | ||||||
| import ujson | import ujson | ||||||
|  | import json | ||||||
| import contextlib | import contextlib | ||||||
| 
 | 
 | ||||||
| from libc.math cimport exp | from libc.math cimport exp | ||||||
|  | @ -37,10 +38,9 @@ from preshed.maps cimport MapStruct | ||||||
| from preshed.maps cimport map_get | from preshed.maps cimport map_get | ||||||
| 
 | 
 | ||||||
| from thinc.api import layerize, chain, noop, clone, with_flatten | from thinc.api import layerize, chain, noop, clone, with_flatten | ||||||
| from thinc.neural import Model, Affine, ReLu, Maxout | from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | from thinc.misc import LayerNorm | ||||||
| from thinc.neural._classes.selu import SELU | 
 | ||||||
| from thinc.neural._classes.layernorm import LayerNorm |  | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.ops import NumpyOps, CupyOps | ||||||
| from thinc.neural.util import get_array_module | from thinc.neural.util import get_array_module | ||||||
| 
 | 
 | ||||||
|  | @ -48,7 +48,8 @@ from .. import util | ||||||
| from ..util import get_async, get_cuda_stream | from ..util import get_async, get_cuda_stream | ||||||
| from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | ||||||
| from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | ||||||
| from .._ml import Residual, drop_layer | from .._ml import Residual, drop_layer, flatten | ||||||
|  | from .._ml import link_vectors_to_models | ||||||
| from ..compat import json_dumps | from ..compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from . import _parse_features | from . import _parse_features | ||||||
|  | @ -238,14 +239,15 @@ cdef class Parser: | ||||||
|     Base class of the DependencyParser and EntityRecognizer. |     Base class of the DependencyParser and EntityRecognizer. | ||||||
|     """ |     """ | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): |     def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg): | ||||||
|         depth = util.env_opt('parser_hidden_depth', depth) |         depth = util.env_opt('parser_hidden_depth', depth) | ||||||
|         token_vector_width = util.env_opt('token_vector_width', token_vector_width) |         token_vector_width = util.env_opt('token_vector_width', token_vector_width) | ||||||
|         hidden_width = util.env_opt('hidden_width', hidden_width) |         hidden_width = util.env_opt('hidden_width', hidden_width) | ||||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) |         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) | ||||||
|         embed_size = util.env_opt('embed_size', 4000) |         embed_size = util.env_opt('embed_size', 7000) | ||||||
|         tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, |         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|                                     preprocess=doc2feats())) |                           pretrained_dims=cfg.get('pretrained_dims', 0)) | ||||||
|  |         tok2vec = chain(tok2vec, flatten) | ||||||
|         if parser_maxout_pieces == 1: |         if parser_maxout_pieces == 1: | ||||||
|             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, |             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, | ||||||
|                         nF=cls.nr_feature, |                         nF=cls.nr_feature, | ||||||
|  | @ -262,8 +264,8 @@ cdef class Parser: | ||||||
|                 upper.is_noop = True |                 upper.is_noop = True | ||||||
|             else: |             else: | ||||||
|                 upper = chain( |                 upper = chain( | ||||||
|                     clone(Maxout(hidden_width), (depth-1)), |                     clone(Maxout(hidden_width), depth-1), | ||||||
|                     zero_init(Affine(nr_class, drop_factor=0.0)) |                     zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) | ||||||
|                 ) |                 ) | ||||||
|                 upper.is_noop = False |                 upper.is_noop = False | ||||||
|         # TODO: This is an unfortunate hack atm! |         # TODO: This is an unfortunate hack atm! | ||||||
|  | @ -277,7 +279,7 @@ cdef class Parser: | ||||||
|             'hidden_width': hidden_width, |             'hidden_width': hidden_width, | ||||||
|             'maxout_pieces': parser_maxout_pieces |             'maxout_pieces': parser_maxout_pieces | ||||||
|         } |         } | ||||||
|         return (tensors, lower, upper), cfg |         return (tok2vec, lower, upper), cfg | ||||||
| 
 | 
 | ||||||
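Several of the sizes in the Model classmethod above (parser_hidden_depth, token_vector_width, hidden_width, parser_maxout_pieces, embed_size) are read through util.env_opt, which lets environment variables override the keyword defaults. A rough standalone sketch of that lookup pattern; this is a simplified stand-in, not spaCy's actual util.env_opt:

import os

def env_opt(name, default=None):
    # fall back to the default, but let an environment variable of the same
    # name win, cast to the default's type so numeric settings stay numeric
    if name not in os.environ:
        return default
    value = os.environ[name]
    return type(default)(value) if default is not None else value

hidden_width = env_opt('hidden_width', 200)
# e.g. launching training with hidden_width=300 in the environment yields 300 here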
|     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): |     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): | ||||||
|         """ |         """ | ||||||
|  | @ -307,12 +309,16 @@ cdef class Parser: | ||||||
|             cfg['beam_width'] = util.env_opt('beam_width', 1) |             cfg['beam_width'] = util.env_opt('beam_width', 1) | ||||||
|         if 'beam_density' not in cfg: |         if 'beam_density' not in cfg: | ||||||
|             cfg['beam_density'] = util.env_opt('beam_density', 0.0) |             cfg['beam_density'] = util.env_opt('beam_density', 0.0) | ||||||
|  |         if 'pretrained_dims' not in cfg: | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||||
|  |         cfg.setdefault('cnn_maxout_pieces', 3) | ||||||
|         self.cfg = cfg |         self.cfg = cfg | ||||||
|         if 'actions' in self.cfg: |         if 'actions' in self.cfg: | ||||||
|             for action, labels in self.cfg.get('actions', {}).items(): |             for action, labels in self.cfg.get('actions', {}).items(): | ||||||
|                 for label in labels: |                 for label in labels: | ||||||
|                     self.moves.add_action(action, label) |                     self.moves.add_action(action, label) | ||||||
|         self.model = model |         self.model = model | ||||||
|  |         self._multitasks = [] | ||||||
| 
 | 
 | ||||||
|     def __reduce__(self): |     def __reduce__(self): | ||||||
|         return (Parser, (self.vocab, self.moves, self.model), None, None) |         return (Parser, (self.vocab, self.moves, self.model), None, None) | ||||||
|  | @ -332,11 +338,11 @@ cdef class Parser: | ||||||
|             beam_density = self.cfg.get('beam_density', 0.0) |             beam_density = self.cfg.get('beam_density', 0.0) | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         if beam_width == 1: |         if beam_width == 1: | ||||||
|             states = self.parse_batch([doc], [doc.tensor]) |             states = self.parse_batch([doc]) | ||||||
|             self.set_annotations([doc], states) |             self.set_annotations([doc], states) | ||||||
|             return doc |             return doc | ||||||
|         else: |         else: | ||||||
|             beam = self.beam_parse([doc], [doc.tensor], |             beam = self.beam_parse([doc], | ||||||
|                         beam_width=beam_width, beam_density=beam_density)[0] |                         beam_width=beam_width, beam_density=beam_density)[0] | ||||||
|             output = self.moves.get_beam_annot(beam) |             output = self.moves.get_beam_annot(beam) | ||||||
|             state = <StateClass>beam.at(0) |             state = <StateClass>beam.at(0) | ||||||
|  | @ -365,11 +371,11 @@ cdef class Parser: | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         for docs in cytoolz.partition_all(batch_size, docs): |         for docs in cytoolz.partition_all(batch_size, docs): | ||||||
|             docs = list(docs) |             docs = list(docs) | ||||||
|             tokvecs = [doc.tensor for doc in docs] |  | ||||||
|             if beam_width == 1: |             if beam_width == 1: | ||||||
|                 parse_states = self.parse_batch(docs, tokvecs) |                 parse_states = self.parse_batch(docs) | ||||||
|  |                 beams = [] | ||||||
|             else: |             else: | ||||||
|                 beams = self.beam_parse(docs, tokvecs, |                 beams = self.beam_parse(docs, | ||||||
|                             beam_width=beam_width, beam_density=beam_density) |                             beam_width=beam_width, beam_density=beam_density) | ||||||
|                 parse_states = [] |                 parse_states = [] | ||||||
|                 for beam in beams: |                 for beam in beams: | ||||||
|  | @ -377,7 +383,7 @@ cdef class Parser: | ||||||
|             self.set_annotations(docs, parse_states) |             self.set_annotations(docs, parse_states) | ||||||
|             yield from docs |             yield from docs | ||||||
| 
 | 
 | ||||||
|     def parse_batch(self, docs, tokvecses): |     def parse_batch(self, docs): | ||||||
|         cdef: |         cdef: | ||||||
|             precompute_hiddens state2vec |             precompute_hiddens state2vec | ||||||
|             StateClass state |             StateClass state | ||||||
|  | @ -388,21 +394,15 @@ cdef class Parser: | ||||||
|             int nr_class, nr_feat, nr_piece, nr_dim, nr_state |             int nr_class, nr_feat, nr_piece, nr_dim, nr_state | ||||||
|         if isinstance(docs, Doc): |         if isinstance(docs, Doc): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|         if isinstance(tokvecses, np.ndarray): |  | ||||||
|             tokvecses = [tokvecses] |  | ||||||
| 
 | 
 | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) |         cuda_stream = get_cuda_stream() | ||||||
|         if USE_FINE_TUNE: |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) |                                                                             0.0) | ||||||
| 
 | 
 | ||||||
|         nr_state = len(docs) |         nr_state = len(docs) | ||||||
|         nr_class = self.moves.n_moves |         nr_class = self.moves.n_moves | ||||||
|         nr_dim = tokvecs.shape[1] |         nr_dim = tokvecs.shape[1] | ||||||
|         nr_feat = self.nr_feature |         nr_feat = self.nr_feature | ||||||
| 
 |  | ||||||
|         cuda_stream = get_cuda_stream() |  | ||||||
|         state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs, |  | ||||||
|                                                      cuda_stream, 0.0) |  | ||||||
|         nr_piece = state2vec.nP |         nr_piece = state2vec.nP | ||||||
| 
 | 
 | ||||||
|         states = self.moves.init_batch(docs) |         states = self.moves.init_batch(docs) | ||||||
|  | @ -418,21 +418,23 @@ cdef class Parser: | ||||||
|         c_token_ids = <int*>token_ids.data |         c_token_ids = <int*>token_ids.data | ||||||
|         c_is_valid = <int*>is_valid.data |         c_is_valid = <int*>is_valid.data | ||||||
|         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) |         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) | ||||||
|  |         cdef int nr_step | ||||||
|         while not next_step.empty(): |         while not next_step.empty(): | ||||||
|  |             nr_step = next_step.size() | ||||||
|             if not has_hidden: |             if not has_hidden: | ||||||
|                 for i in cython.parallel.prange( |                 for i in cython.parallel.prange(nr_step, num_threads=6, | ||||||
|                         next_step.size(), num_threads=6, nogil=True): |                                                 nogil=True): | ||||||
|                     self._parse_step(next_step[i], |                     self._parse_step(next_step[i], | ||||||
|                         feat_weights, nr_class, nr_feat, nr_piece) |                         feat_weights, nr_class, nr_feat, nr_piece) | ||||||
|             else: |             else: | ||||||
|                 for i in range(next_step.size()): |                 for i in range(nr_step): | ||||||
|                     st = next_step[i] |                     st = next_step[i] | ||||||
|                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) |                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||||
|                     self.moves.set_valid(&c_is_valid[i*nr_class], st) |                     self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||||
|                 vectors = state2vec(token_ids[:next_step.size()]) |                 vectors = state2vec(token_ids[:next_step.size()]) | ||||||
|                 scores = vec2scores(vectors) |                 scores = vec2scores(vectors) | ||||||
|                 c_scores = <float*>scores.data |                 c_scores = <float*>scores.data | ||||||
|                 for i in range(next_step.size()): |                 for i in range(nr_step): | ||||||
|                     st = next_step[i] |                     st = next_step[i] | ||||||
|                     guess = arg_max_if_valid( |                     guess = arg_max_if_valid( | ||||||
|                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) |                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||||
|  | @ -445,18 +447,15 @@ cdef class Parser: | ||||||
|                     next_step.push_back(st) |                     next_step.push_back(st) | ||||||
|         return states |         return states | ||||||
| 
 | 
 | ||||||
|     def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): |     def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): | ||||||
|         cdef Beam beam |         cdef Beam beam | ||||||
|         cdef np.ndarray scores |         cdef np.ndarray scores | ||||||
|         cdef Doc doc |         cdef Doc doc | ||||||
|         cdef int nr_class = self.moves.n_moves |         cdef int nr_class = self.moves.n_moves | ||||||
|         cdef StateClass stcls, output |         cdef StateClass stcls, output | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) |  | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|                                                      cuda_stream, 0.0) |                                                                             0.0) | ||||||
|         beams = [] |         beams = [] | ||||||
|         cdef int offset = 0 |         cdef int offset = 0 | ||||||
|         cdef int j = 0 |         cdef int j = 0 | ||||||
|  | @ -516,29 +515,24 @@ cdef class Parser: | ||||||
|         free(scores) |         free(scores) | ||||||
|         free(token_ids) |         free(token_ids) | ||||||
| 
 | 
 | ||||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|         if not any(self.moves.has_gold(gold) for gold in golds): |         if not any(self.moves.has_gold(gold) for gold in golds): | ||||||
|             return None |             return None | ||||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: |         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: | ||||||
|             return self.update_beam(docs_tokvecs, golds, |             return self.update_beam(docs, golds, | ||||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], |                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||||
|                     drop=drop, sgd=sgd, losses=losses) |                     drop=drop, sgd=sgd, losses=losses) | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvec_lists = docs_tokvecs |  | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvec_lists) |  | ||||||
|         if isinstance(docs, Doc) and isinstance(golds, GoldParse): |         if isinstance(docs, Doc) and isinstance(golds, GoldParse): | ||||||
|             docs = [docs] |             docs = [docs] | ||||||
|             golds = [golds] |             golds = [golds] | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) |  | ||||||
|             tokvecs = self.model[0].ops.flatten(my_tokvecs) |  | ||||||
| 
 | 
 | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
| 
 | 
 | ||||||
|         states, golds, max_steps = self._init_gold_batch(docs, golds) |         states, golds, max_steps = self._init_gold_batch(docs, golds) | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||||
|                                                       0.0) |                                                                             drop) | ||||||
|         todo = [(s, g) for (s, g) in zip(states, golds) |         todo = [(s, g) for (s, g) in zip(states, golds) | ||||||
|                 if not s.is_final() and g is not None] |                 if not s.is_final() and g is not None] | ||||||
|         if not todo: |         if not todo: | ||||||
|  | @ -582,13 +576,9 @@ cdef class Parser: | ||||||
|             if n_steps >= max_steps: |             if n_steps >= max_steps: | ||||||
|                 break |                 break | ||||||
|         self._make_updates(d_tokvecs, |         self._make_updates(d_tokvecs, | ||||||
|             backprops, sgd, cuda_stream) |             bp_tokvecs, backprops, sgd, cuda_stream) | ||||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) |  | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def update_beam(self, docs_tokvecs, golds, width=None, density=None, |     def update_beam(self, docs, golds, width=None, density=None, | ||||||
|             drop=0., sgd=None, losses=None): |             drop=0., sgd=None, losses=None): | ||||||
|         if not any(self.moves.has_gold(gold) for gold in golds): |         if not any(self.moves.has_gold(gold) for gold in golds): | ||||||
|             return None |             return None | ||||||
|  | @ -600,26 +590,20 @@ cdef class Parser: | ||||||
|             density = self.cfg.get('beam_density', 0.0) |             density = self.cfg.get('beam_density', 0.0) | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         docs, tokvecs = docs_tokvecs |  | ||||||
|         lengths = [len(d) for d in docs] |         lengths = [len(d) for d in docs] | ||||||
|         assert min(lengths) >= 1 |         assert min(lengths) >= 1 | ||||||
|         tokvecs = self.model[0].ops.flatten(tokvecs) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) |  | ||||||
|             tokvecs += self.model[0].ops.flatten(my_tokvecs) |  | ||||||
| 
 |  | ||||||
|         states = self.moves.init_batch(docs) |         states = self.moves.init_batch(docs) | ||||||
|         for gold in golds: |         for gold in golds: | ||||||
|             self.moves.preprocess_gold(gold) |             self.moves.preprocess_gold(gold) | ||||||
| 
 | 
 | ||||||
|         cuda_stream = get_cuda_stream() |         cuda_stream = get_cuda_stream() | ||||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) |         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) | ||||||
| 
 | 
 | ||||||
|         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, |         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, | ||||||
|                                         states, tokvecs, golds, |                                         states, golds, | ||||||
|                                         state2vec, vec2scores, |                                         state2vec, vec2scores, | ||||||
|                                         width, density, |                                         width, density, | ||||||
|                                         sgd=sgd, drop=drop, losses=losses) |                                         drop=drop, losses=losses) | ||||||
|         backprop_lower = [] |         backprop_lower = [] | ||||||
|         cdef float batch_size = len(docs) |         cdef float batch_size = len(docs) | ||||||
|         for i, d_scores in enumerate(states_d_scores): |         for i, d_scores in enumerate(states_d_scores): | ||||||
|  | @ -637,11 +621,7 @@ cdef class Parser: | ||||||
|             else: |             else: | ||||||
|                 backprop_lower.append((ids, d_vector, bp_vectors)) |                 backprop_lower.append((ids, d_vector, bp_vectors)) | ||||||
|         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) |         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) | ||||||
|         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) |         self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) | ||||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) |  | ||||||
|         if USE_FINE_TUNE: |  | ||||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) |  | ||||||
|         return d_tokvecs |  | ||||||
| 
 | 
 | ||||||
|     def _init_gold_batch(self, whole_docs, whole_golds): |     def _init_gold_batch(self, whole_docs, whole_golds): | ||||||
|         """Make a square batch, of length equal to the shortest doc. A long |         """Make a square batch, of length equal to the shortest doc. A long | ||||||
|  | @ -679,7 +659,7 @@ cdef class Parser: | ||||||
|             max_moves = max(max_moves, len(oracle_actions)) |             max_moves = max(max_moves, len(oracle_actions)) | ||||||
|         return states, golds, max_moves |         return states, golds, max_moves | ||||||
| 
 | 
 | ||||||
|     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): |     def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None): | ||||||
|         # Tells CUDA to block, so our async copies complete. |         # Tells CUDA to block, so our async copies complete. | ||||||
|         if cuda_stream is not None: |         if cuda_stream is not None: | ||||||
|             cuda_stream.synchronize() |             cuda_stream.synchronize() | ||||||
|  | @ -690,6 +670,7 @@ cdef class Parser: | ||||||
|             d_state_features *= mask.reshape(ids.shape + (1,)) |             d_state_features *= mask.reshape(ids.shape + (1,)) | ||||||
|             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, |             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, | ||||||
|                 d_state_features) |                 d_state_features) | ||||||
|  |         bp_tokvecs(d_tokvecs, sgd=sgd) | ||||||
| 
 | 
 | ||||||
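For readers new to the scatter_add call above: each backprop step yields gradients only for the handful of tokens a state used as features, and those rows have to be summed back into the full d_tokvecs array before bp_tokvecs is called (tokens used by several states accumulate several contributions). A numpy sketch of that accumulation, with invented toy shapes and numpy.add.at standing in for ops.scatter_add:

import numpy

d_tokvecs = numpy.zeros((4, 3))            # gradient buffer: 4 tokens, width 3
ids = numpy.array([[0, 2], [2, 3]])        # token ids used as features by 2 states
d_feats = numpy.ones((2, 2, 3))            # gradient per (state, feature, dim)

# token 2 was used by both states, so its row collects two contributions
numpy.add.at(d_tokvecs, ids.ravel(), d_feats.reshape(-1, 3))
print(d_tokvecs[:, 0])                     # [1. 0. 2. 1.]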
|     @property |     @property | ||||||
|     def move_names(self): |     def move_names(self): | ||||||
|  | @ -699,11 +680,12 @@ cdef class Parser: | ||||||
|             names.append(name) |             names.append(name) | ||||||
|         return names |         return names | ||||||
| 
 | 
 | ||||||
|     def get_batch_model(self, batch_size, tokvecs, stream, dropout): |     def get_batch_model(self, docs, stream, dropout): | ||||||
|         _, lower, upper = self.model |         tok2vec, lower, upper = self.model | ||||||
|         state2vec = precompute_hiddens(batch_size, tokvecs, |         tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout) | ||||||
|                         lower, stream, drop=dropout) |         state2vec = precompute_hiddens(len(docs), tokvecs, | ||||||
|         return state2vec, upper |                                        lower, stream, drop=0.0) | ||||||
|  |         return (tokvecs, bp_tokvecs), state2vec, upper | ||||||
| 
 | 
 | ||||||
|     nr_feature = 8 |     nr_feature = 8 | ||||||
| 
 | 
 | ||||||
|  | @ -766,7 +748,7 @@ cdef class Parser: | ||||||
|                 # order, or the model goes out of synch |                 # order, or the model goes out of synch | ||||||
|                 self.cfg.setdefault('extra_labels', []).append(label) |                 self.cfg.setdefault('extra_labels', []).append(label) | ||||||
| 
 | 
 | ||||||
|     def begin_training(self, gold_tuples, **cfg): |     def begin_training(self, gold_tuples, pipeline=None, **cfg): | ||||||
|         if 'model' in cfg: |         if 'model' in cfg: | ||||||
|             self.model = cfg['model'] |             self.model = cfg['model'] | ||||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) |         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||||
|  | @ -775,9 +757,22 @@ cdef class Parser: | ||||||
|             for label in labels: |             for label in labels: | ||||||
|                 self.moves.add_action(action, label) |                 self.moves.add_action(action, label) | ||||||
|         if self.model is True: |         if self.model is True: | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             self.model, cfg = self.Model(self.moves.n_moves, **cfg) |             self.model, cfg = self.Model(self.moves.n_moves, **cfg) | ||||||
|  |             self.init_multitask_objectives(gold_tuples, pipeline, **cfg) | ||||||
|  |             link_vectors_to_models(self.vocab) | ||||||
|             self.cfg.update(cfg) |             self.cfg.update(cfg) | ||||||
| 
 | 
 | ||||||
|  |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|  |         '''Set up models for secondary objectives, to benefit from multi-task | ||||||
|  |         learning. This method is intended to be overridden by subclasses. | ||||||
|  | 
 | ||||||
|  |         For instance, the dependency parser can benefit from sharing | ||||||
|  |         an input representation with a label prediction model. These auxiliary | ||||||
|  |         models are discarded after training. | ||||||
|  |         ''' | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
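As the docstring says, this base implementation is a no-op hook; subclasses in this file override it (see the NeuralDependencyParser and NeuralEntityRecognizer hunks above, where the target list is currently empty). A sketch of what an override could look like if, say, a 'tag' auxiliary objective were enabled; the class name is illustrative, and it assumes the Parser and NeuralLabeller classes from this changeset are in scope:

class MultitaskParser(Parser):
    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
        # share the tok2vec layer with an auxiliary tagger-style labeller
        for target in ['tag']:
            labeller = NeuralLabeller(self.vocab, target=target)
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline,
                                    tok2vec=tok2vec)
            pipeline.append(labeller)
            self._multitasks.append(labeller)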
|     def preprocess_gold(self, docs_golds): |     def preprocess_gold(self, docs_golds): | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|             yield doc, gold |             yield doc, gold | ||||||
|  | @ -813,6 +808,7 @@ cdef class Parser: | ||||||
|         if 'model' not in exclude: |         if 'model' not in exclude: | ||||||
|             path = util.ensure_path(path) |             path = util.ensure_path(path) | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|  |                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|                 self.model, cfg = self.Model(**self.cfg) |                 self.model, cfg = self.Model(**self.cfg) | ||||||
|             else: |             else: | ||||||
|                 cfg = {} |                 cfg = {} | ||||||
|  | @ -835,7 +831,7 @@ cdef class Parser: | ||||||
|             ('upper_model', lambda: self.model[2].to_bytes()), |             ('upper_model', lambda: self.model[2].to_bytes()), | ||||||
|             ('vocab', lambda: self.vocab.to_bytes()), |             ('vocab', lambda: self.vocab.to_bytes()), | ||||||
|             ('moves', lambda: self.moves.to_bytes(strings=False)), |             ('moves', lambda: self.moves.to_bytes(strings=False)), | ||||||
|             ('cfg', lambda: ujson.dumps(self.cfg)) |             ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) | ||||||
|         )) |         )) | ||||||
|         if 'model' in exclude: |         if 'model' in exclude: | ||||||
|             exclude['tok2vec_model'] = True |             exclude['tok2vec_model'] = True | ||||||
|  | @ -848,7 +844,7 @@ cdef class Parser: | ||||||
|         deserializers = OrderedDict(( |         deserializers = OrderedDict(( | ||||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), |             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||||
|             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), |             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), | ||||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), |             ('cfg', lambda b: self.cfg.update(json.loads(b))), | ||||||
|             ('tok2vec_model', lambda b: None), |             ('tok2vec_model', lambda b: None), | ||||||
|             ('lower_model', lambda b: None), |             ('lower_model', lambda b: None), | ||||||
|             ('upper_model', lambda b: None) |             ('upper_model', lambda b: None) | ||||||
|  | @ -856,9 +852,11 @@ cdef class Parser: | ||||||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) |         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||||
|         if 'model' not in exclude: |         if 'model' not in exclude: | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 self.model, cfg = self.Model(self.moves.n_moves) |                 self.model, cfg = self.Model(**self.cfg) | ||||||
|  |                 cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             else: |             else: | ||||||
|                 cfg = {} |                 cfg = {} | ||||||
|  |             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||||
|             if 'tok2vec_model' in msg: |             if 'tok2vec_model' in msg: | ||||||
|                 self.model[0].from_bytes(msg['tok2vec_model']) |                 self.model[0].from_bytes(msg['tok2vec_model']) | ||||||
|             if 'lower_model' in msg: |             if 'lower_model' in msg: | ||||||
|  |  | ||||||
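The `init_multitask_objectives` hook added above is a no-op on the base `Parser` and is meant to be overridden by subclasses that want auxiliary training objectives. A minimal sketch of the override pattern, using stand-in names (`BaseParser`, `build_aux_model`) rather than spaCy's real classes:

    # Illustrative only: BaseParser and build_aux_model are stand-ins, not spaCy APIs.
    class BaseParser(object):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Base behaviour: no secondary objectives.
            pass

    def build_aux_model(**cfg):
        # Placeholder for an auxiliary label-prediction model that would share
        # the parser's input representation during training.
        return object()

    class MultiTaskParser(BaseParser):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Build the auxiliary objective(s); they are discarded after training.
            self._multitasks = [build_aux_model(**cfg)]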
|  | @ -148,7 +148,7 @@ cdef class TransitionSystem: | ||||||
| 
 | 
 | ||||||
|     def add_action(self, int action, label_name): |     def add_action(self, int action, label_name): | ||||||
|         cdef attr_t label_id |         cdef attr_t label_id | ||||||
|         if not isinstance(label_name, int): |         if not isinstance(label_name, (int, long)): | ||||||
|             label_id = self.strings.add(label_name) |             label_id = self.strings.add(label_name) | ||||||
|         else: |         else: | ||||||
|             label_id = label_name |             label_id = label_name | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ from .. import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', | _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', | ||||||
|               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] |               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th','xx'] | ||||||
| _models = {'en': ['en_core_web_sm'], | _models = {'en': ['en_core_web_sm'], | ||||||
|            'de': ['de_core_news_md'], |            'de': ['de_core_news_md'], | ||||||
|            'fr': ['fr_depvec_web_lg'], |            'fr': ['fr_depvec_web_lg'], | ||||||
|  | @ -108,6 +108,11 @@ def he_tokenizer(): | ||||||
| def nb_tokenizer(): | def nb_tokenizer(): | ||||||
|     return util.get_lang_class('nb').Defaults.create_tokenizer() |     return util.get_lang_class('nb').Defaults.create_tokenizer() | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def th_tokenizer(): | ||||||
|  |     pythainlp = pytest.importorskip("pythainlp") | ||||||
|  |     return util.get_lang_class('th').Defaults.create_tokenizer() | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def stringstore(): | def stringstore(): | ||||||
|  |  | ||||||
|  | @ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): | ||||||
|     assert len(tokens) == 4 |     assert len(tokens) == 4 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text', ["blau-rot"]) |  | ||||||
| def test_tokenizer_splits_hyphens(de_tokenizer, text): |  | ||||||
|     tokens = de_tokenizer(text) |  | ||||||
|     assert len(tokens) == 3 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) | @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) | ||||||
| def test_tokenizer_splits_numeric_range(de_tokenizer, text): | def test_tokenizer_splits_numeric_range(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|  | @ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt']) | ||||||
|  | def test_tokenizer_keeps_hyphens(de_tokenizer, text): | ||||||
|  |     tokens = de_tokenizer(text) | ||||||
|  |     assert len(tokens) == 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): | def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): | ||||||
|     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") |     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") | ||||||
|     assert len(tokens) == 12 |     assert len(tokens) == 10 | ||||||
|     assert tokens[0].text == "Viele" |     assert tokens[0].text == "Viele" | ||||||
|     assert tokens[1].text == "Regeln" |     assert tokens[1].text == "Regeln" | ||||||
|     assert tokens[2].text == "--" |     assert tokens[2].text == "--" | ||||||
|     assert tokens[3].text == "wie" |     assert tokens[3].text == "wie" | ||||||
|     assert tokens[4].text == "die" |     assert tokens[4].text == "die" | ||||||
|     assert tokens[5].text == "Bindestrich" |     assert tokens[5].text == "Bindestrich-Regeln" | ||||||
|     assert tokens[6].text == "-" |     assert tokens[6].text == "--" | ||||||
|     assert tokens[7].text == "Regeln" |     assert tokens[7].text == "sind" | ||||||
|     assert tokens[8].text == "--" |     assert tokens[8].text == "kompliziert" | ||||||
|     assert tokens[9].text == "sind" |  | ||||||
|     assert tokens[10].text == "kompliziert" |  | ||||||
|  |  | ||||||
|  | @ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. | ||||||
|     assert len(tokens) == 109 |     assert len(tokens) == 109 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,length', [ | @pytest.mark.parametrize('text', [ | ||||||
|     ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), |     "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", | ||||||
|     ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), |     "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", | ||||||
|     ("Kraftfahrzeug-Haftpflichtversicherung", 3), |     "Kraftfahrzeug-Haftpflichtversicherung", | ||||||
|     ("Vakuum-Mittelfrequenz-Induktionsofen", 5) |     "Vakuum-Mittelfrequenz-Induktionsofen" | ||||||
|     ]) |     ]) | ||||||
| def test_tokenizer_handles_long_words(de_tokenizer, text, length): | def test_tokenizer_handles_long_words(de_tokenizer, text): | ||||||
|     tokens = de_tokenizer(text) |     tokens = de_tokenizer(text) | ||||||
|     assert len(tokens) == length |     assert len(tokens) == 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.parametrize('text,length', [ | @pytest.mark.parametrize('text,length', [ | ||||||
|  |  | ||||||
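The rewritten German tests above encode a tokenizer change: hyphenated compounds such as "Bindestrich-Regeln" or "Kraftfahrzeug-Haftpflichtversicherung" now stay single tokens. A quick check of that behaviour, mirroring the conftest fixture pattern (only the German language data is needed, no statistical model):

    from spacy import util

    de_tokenizer = util.get_lang_class('de').Defaults.create_tokenizer()
    doc = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
    print([t.text for t in doc])
    # Per the updated test, 'Bindestrich-Regeln' is one token and the sentence
    # now yields 10 tokens instead of 12.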
							
								
								
									
spacy/tests/lang/th/__init__.py (new file, 0 lines)
spacy/tests/lang/th/test_tokenizer.py (new file, 13 lines)
							|  | @ -0,0 +1,13 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | TOKENIZER_TESTS = [ | ||||||
|  |         ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) | ||||||
|  | def test_thai_tokenizer(th_tokenizer, text, expected_tokens): | ||||||
|  | 	tokens = [token.text for token in th_tokenizer(text)] | ||||||
|  | 	assert tokens == expected_tokens | ||||||
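The new Thai tests depend on the third-party `pythainlp` package, which is why the conftest fixture above guards tokenizer creation with `pytest.importorskip`. Outside the test suite, the same tokenizer could be created like this (a sketch that assumes pythainlp is installed):

    from spacy import util

    # Creating the tokenizer fails with an ImportError if pythainlp is missing.
    th_tokenizer = util.get_lang_class('th').Defaults.create_tokenizer()
    tokens = [t.text for t in th_tokenizer(u"คุณรักผมไหม")]
    assert tokens == [u'คุณ', u'รัก', u'ผม', u'ไหม']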
|  | @ -26,7 +26,7 @@ def arc_eager(vocab): | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def tok2vec(): | def tok2vec(): | ||||||
|     return Tok2Vec(8, 100, preprocess=doc2feats()) |     return Tok2Vec(8, 100) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  | @ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc): | ||||||
|     parser(doc) |     parser(doc) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_update_doc(parser, tok2vec, model, doc, gold): | def test_update_doc(parser, model, doc, gold): | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) |  | ||||||
|     d_tokvecs = parser.update(([doc], tokvecs), [gold]) |  | ||||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape |  | ||||||
|     def optimize(weights, gradient, key=None): |     def optimize(weights, gradient, key=None): | ||||||
|         weights -= 0.001 * gradient |         weights -= 0.001 * gradient | ||||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) |     parser.update([doc], [gold], sgd=optimize) | ||||||
|     assert d_tokvecs[0].sum() == 0. |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_predict_doc_beam(parser, tok2vec, model, doc): | def test_predict_doc_beam(parser, model, doc): | ||||||
|     doc.tensor = tok2vec([doc])[0] |  | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     parser(doc, beam_width=32, beam_density=0.001) |     parser(doc, beam_width=32, beam_density=0.001) | ||||||
|     for word in doc: |  | ||||||
|         print(word.text, word.head, word.dep_) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_update_doc_beam(parser, tok2vec, model, doc, gold): | def test_update_doc_beam(parser, model, doc, gold): | ||||||
|     parser.model = model |     parser.model = model | ||||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) |  | ||||||
|     d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) |  | ||||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape |  | ||||||
|     def optimize(weights, gradient, key=None): |     def optimize(weights, gradient, key=None): | ||||||
|         weights -= 0.001 * gradient |         weights -= 0.001 * gradient | ||||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) |     parser.update_beam([doc], [gold], sgd=optimize) | ||||||
|     assert d_tokvecs[0].sum() == 0. |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
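The test changes above reflect the new training API: the parser now runs its own tok2vec step inside `update`, so callers pass plain `Doc` and `GoldParse` objects plus an `sgd` callable instead of precomputed token vectors. The callback shape is unchanged; a small sketch (the `parser`, `doc` and `gold` objects are the fixtures above, not defined here):

    def make_sgd(learn_rate=0.001):
        # Same signature as the optimize() callbacks in the tests above.
        def sgd(weights, gradient, key=None):
            weights -= learn_rate * gradient
        return sgd

    # Usage with the fixtures above (sketch):
    #     parser.update([doc], [gold], sgd=make_sgd())
    #     parser.update_beam([doc], [gold], sgd=make_sgd())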
							
								
								
									
spacy/tests/regression/test_issue1305.py (new file, 8 lines)
							|  | @ -0,0 +1,8 @@ | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_issue1305(EN): | ||||||
|  |     '''Test lemmatization of English VBZ''' | ||||||
|  |     assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) | ||||||
|  |     doc = EN(u'This app works well') | ||||||
|  |     assert doc[2].lemma_ == 'work' | ||||||
							
								
								
									
spacy/tests/regression/test_issue1380.py (new file, 14 lines)
							|  | @ -0,0 +1,14 @@ | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ...language import Language | ||||||
|  | 
 | ||||||
|  | def test_issue1380_empty_string(): | ||||||
|  |     nlp = Language() | ||||||
|  |     doc = nlp('') | ||||||
|  |     assert len(doc) == 0 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_issue1380_en(EN): | ||||||
|  |     doc = EN('') | ||||||
|  |     assert len(doc) == 0 | ||||||
|  | @ -9,11 +9,14 @@ import pytest | ||||||
| @pytest.mark.models('en') | @pytest.mark.models('en') | ||||||
| def test_issue429(EN): | def test_issue429(EN): | ||||||
|     def merge_phrases(matcher, doc, i, matches): |     def merge_phrases(matcher, doc, i, matches): | ||||||
|       if i != len(matches) - 1: |         if i != len(matches) - 1: | ||||||
|         return None |             return None | ||||||
|       spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] |         spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] | ||||||
|       for ent_id, label, span in spans: |         for ent_id, label, span in spans: | ||||||
|         span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label]) |             span.merge( | ||||||
|  |                 tag=('NNP' if label else span.root.tag_), | ||||||
|  |                 lemma=span.text, | ||||||
|  |                 label='PERSON') | ||||||
| 
 | 
 | ||||||
|     doc = EN('a') |     doc = EN('a') | ||||||
|     matcher = Matcher(EN.vocab) |     matcher = Matcher(EN.vocab) | ||||||
|  |  | ||||||
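The regression test above now calls `Span.merge` with keyword arguments (`tag`, `lemma`, `label`) instead of positional ones. A hedged sketch of the same call shape, assuming an English model such as `en_core_web_sm` is installed (mirroring the `@pytest.mark.models('en')` guard):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I spoke to Sarah Connor yesterday')
    span = doc[3:5]   # "Sarah Connor"
    # Keyword arguments, as in the updated test.
    span.merge(tag=span.root.tag_, lemma=span.text, label='PERSON')
    assert doc[3].text == u'Sarah Connor'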
|  | @ -11,7 +11,7 @@ import pytest | ||||||
| def taggers(en_vocab): | def taggers(en_vocab): | ||||||
|     tagger1 = Tagger(en_vocab) |     tagger1 = Tagger(en_vocab) | ||||||
|     tagger2 = Tagger(en_vocab) |     tagger2 = Tagger(en_vocab) | ||||||
|     tagger1.model = tagger1.Model(8, 8) |     tagger1.model = tagger1.Model(8) | ||||||
|     tagger2.model = tagger1.model |     tagger2.model = tagger1.model | ||||||
|     return (tagger1, tagger2) |     return (tagger1, tagger2) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,6 +6,16 @@ from ...strings import StringStore | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_string_hash(stringstore): | ||||||
|  |     '''Test that string hashing is stable across platforms''' | ||||||
|  |     ss = stringstore | ||||||
|  |     assert ss.add('apple') == 8566208034543834098 | ||||||
|  |     heart = '\U0001f499' | ||||||
|  |     print(heart) | ||||||
|  |     h = ss.add(heart) | ||||||
|  |     assert h == 11841826740069053588 | ||||||
|  |   | ||||||
|  | 
 | ||||||
| def test_stringstore_from_api_docs(stringstore): | def test_stringstore_from_api_docs(stringstore): | ||||||
|     apple_hash = stringstore.add('apple') |     apple_hash = stringstore.add('apple') | ||||||
|     assert apple_hash == 8566208034543834098 |     assert apple_hash == 8566208034543834098 | ||||||
|  |  | ||||||
|  | @ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab): | ||||||
|     assert len(patterns[0]) |     assert len(patterns[0]) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail |  | ||||||
| def test_matcher_from_usage_docs(en_vocab): | def test_matcher_from_usage_docs(en_vocab): | ||||||
|     text = "Wow 😀 This is really cool! 😂 😂" |     text = "Wow 😀 This is really cool! 😂 😂" | ||||||
|     doc = get_doc(en_vocab, words=text.split(' ')) |     doc = get_doc(en_vocab, words=text.split(' ')) | ||||||
|  | @ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab): | ||||||
|         if doc.vocab.strings[match_id] == 'HAPPY': |         if doc.vocab.strings[match_id] == 'HAPPY': | ||||||
|             doc.sentiment += 0.1 |             doc.sentiment += 0.1 | ||||||
|         span = doc[start : end] |         span = doc[start : end] | ||||||
|         token = span.merge(norm='happy emoji') |         token = span.merge() | ||||||
|  |         token.vocab[token.text].norm_ = 'happy emoji' | ||||||
| 
 | 
 | ||||||
|     matcher = Matcher(en_vocab) |     matcher = Matcher(en_vocab) | ||||||
|     matcher.add('HAPPY', label_sentiment, *pos_patterns) |     matcher.add('HAPPY', label_sentiment, *pos_patterns) | ||||||
|  | @ -98,11 +98,11 @@ def test_matcher_match_multi(matcher): | ||||||
|                             (doc.vocab.strings['Java'], 5, 6)] |                             (doc.vocab.strings['Java'], 5, 6)] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.xfail |  | ||||||
| def test_matcher_phrase_matcher(en_vocab): | def test_matcher_phrase_matcher(en_vocab): | ||||||
|     words = ["Google", "Now"] |     words = ["Google", "Now"] | ||||||
|     doc = get_doc(en_vocab, words) |     doc = get_doc(en_vocab, words) | ||||||
|     matcher = PhraseMatcher(en_vocab, [doc]) |     matcher = PhraseMatcher(en_vocab) | ||||||
|  |     matcher.add('COMPANY', None, doc) | ||||||
|     words = ["I", "like", "Google", "Now", "best"] |     words = ["I", "like", "Google", "Now", "best"] | ||||||
|     doc = get_doc(en_vocab, words) |     doc = get_doc(en_vocab, words) | ||||||
|     assert len(matcher(doc)) == 1 |     assert len(matcher(doc)) == 1 | ||||||
|  |  | ||||||
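The matcher tests above switch to the new `PhraseMatcher` API: the matcher is constructed empty and patterns are registered via `add(key, callback, *docs)`, the same signature as `Matcher.add`. A sketch outside the test helpers, assuming the v2-style `Doc(vocab, words=...)` constructor:

    from spacy.lang.en import English
    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Doc

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('COMPANY', None, Doc(nlp.vocab, words=[u'Google', u'Now']))

    doc = Doc(nlp.vocab, words=[u'I', u'like', u'Google', u'Now', u'best'])
    matches = matcher(doc)   # one (match_id, start, end) triple over tokens 2-4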
|  | @ -9,7 +9,8 @@ from .util import get_doc | ||||||
| 
 | 
 | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import pytest | import pytest | ||||||
| from thinc.neural import Maxout, Softmax | from thinc.neural._classes.maxout import Maxout | ||||||
|  | from thinc.neural._classes.softmax import Softmax | ||||||
| from thinc.api import chain | from thinc.api import chain | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
|  | import sys | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): | ||||||
|     tokens = tokenizer(text) |     tokens = tokenizer(text) | ||||||
|     assert len(tokens) == length |     assert len(tokens) == length | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), | @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), | ||||||
|                                          ('i💙you', 3), ('🤘🤘yay!', 4)]) |                                          ('i💙you', 3), ('🤘🤘yay!', 4)]) | ||||||
| def test_tokenizer_handles_emoji(tokenizer, text, length): | def test_tokenizer_handles_emoji(tokenizer, text, length): | ||||||
|     tokens = tokenizer(text) |     # These break on narrow unicode builds, e.g. Windows | ||||||
|     assert len(tokens) == length |     if sys.maxunicode >= 1114111: | ||||||
|  |         tokens = tokenizer(text) | ||||||
|  |         assert len(tokens) == length | ||||||
|  |  | ||||||
|  | @ -54,7 +54,7 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|     cdef public object noun_chunks_iterator |     cdef public object noun_chunks_iterator | ||||||
| 
 | 
 | ||||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 |     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 | ||||||
| 
 | 
 | ||||||
|     cpdef np.ndarray to_array(self, object features) |     cpdef np.ndarray to_array(self, object features) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -660,7 +660,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         with path.open('rb') as file_: |         with path.open('rb') as file_: | ||||||
|             bytes_data = file_.read() |             bytes_data = file_.read() | ||||||
|         self.from_bytes(bytes_data, **exclude) |         return self.from_bytes(bytes_data, **exclude) | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|         """Serialize, i.e. export the document contents to a binary string. |         """Serialize, i.e. export the document contents to a binary string. | ||||||
|  |  | ||||||
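The one-line change above makes `Doc.from_disk` pass through the return value of `from_bytes`, so deserialization calls can be chained. A small round-trip sketch using the byte-level API shown above:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=[u'hello', u'world'])
    # from_bytes() returns the Doc, and from_disk() now forwards that value.
    doc2 = Doc(vocab).from_bytes(doc.to_bytes())
    assert [t.text for t in doc2] == [u'hello', u'world']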
|  | @ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function | ||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
| import ujson | import ujson | ||||||
| import pip | import pkg_resources | ||||||
| import importlib | import importlib | ||||||
| import regex as re | import regex as re | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | @ -14,6 +14,7 @@ import numpy | ||||||
| import io | import io | ||||||
| import dill | import dill | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| 
 | 
 | ||||||
| import msgpack | import msgpack | ||||||
| import msgpack_numpy | import msgpack_numpy | ||||||
|  | @ -180,9 +181,10 @@ def is_package(name): | ||||||
|     name (unicode): Name of package. |     name (unicode): Name of package. | ||||||
|     RETURNS (bool): True if installed package, False if not. |     RETURNS (bool): True if installed package, False if not. | ||||||
|     """ |     """ | ||||||
|     packages = pip.get_installed_distributions() |     name = name.lower()  # compare package name against lowercase name | ||||||
|  |     packages = pkg_resources.working_set.by_key.keys() | ||||||
|     for package in packages: |     for package in packages: | ||||||
|         if package.project_name.replace('-', '_') == name: |         if package.lower().replace('-', '_') == name: | ||||||
|             return True |             return True | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
|  | @ -193,6 +195,7 @@ def get_package_path(name): | ||||||
|     name (unicode): Package name. |     name (unicode): Package name. | ||||||
|     RETURNS (Path): Path to installed package. |     RETURNS (Path): Path to installed package. | ||||||
|     """ |     """ | ||||||
|  |     name = name.lower()  # use lowercase version to be safe | ||||||
|     # Here we're importing the module just to find it. This is worryingly |     # Here we're importing the module just to find it. This is worryingly | ||||||
|     # indirect, but it's otherwise very difficult to find the package. |     # indirect, but it's otherwise very difficult to find the package. | ||||||
|     pkg = importlib.import_module(name) |     pkg = importlib.import_module(name) | ||||||
|  | @ -557,3 +560,17 @@ def minify_html(html): | ||||||
|     RETURNS (unicode): "Minified" HTML. |     RETURNS (unicode): "Minified" HTML. | ||||||
|     """ |     """ | ||||||
|     return html.strip().replace('    ', '').replace('\n', '') |     return html.strip().replace('    ', '').replace('\n', '') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def use_gpu(gpu_id): | ||||||
|  |     try: | ||||||
|  |         import cupy.cuda.device | ||||||
|  |     except ImportError: | ||||||
|  |         return None | ||||||
|  |     from thinc.neural.ops import CupyOps | ||||||
|  |     device = cupy.cuda.device.Device(gpu_id) | ||||||
|  |     device.use() | ||||||
|  |     Model.ops = CupyOps() | ||||||
|  |     Model.Ops = CupyOps | ||||||
|  |     return device | ||||||
|  | 
 | ||||||
|  |  | ||||||
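The new `util.use_gpu` helper switches thinc's global `Model.ops` to `CupyOps` and returns the CuPy device, or `None` when CuPy is not importable, so callers can fall back to the CPU. A usage sketch:

    from spacy import util

    device = util.use_gpu(0)
    if device is None:
        print("CuPy not available, staying on CPU")
    else:
        print("Thinc ops switched to CupyOps on", device)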
|  | @ -6,6 +6,8 @@ import msgpack | ||||||
| import msgpack_numpy | import msgpack_numpy | ||||||
| msgpack_numpy.patch() | msgpack_numpy.patch() | ||||||
| cimport numpy as np | cimport numpy as np | ||||||
|  | from thinc.neural.util import get_array_module | ||||||
|  | from thinc.neural._classes.model import Model | ||||||
| 
 | 
 | ||||||
| from .typedefs cimport attr_t | from .typedefs cimport attr_t | ||||||
| from .strings cimport StringStore | from .strings cimport StringStore | ||||||
|  | @ -14,15 +16,29 @@ from .compat import basestring_ | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Vectors: | cdef class Vectors: | ||||||
|     '''Store, save and load word vectors.''' |     '''Store, save and load word vectors. | ||||||
|  | 
 | ||||||
|  |     Vectors data is kept in the vectors.data attribute, which should be an | ||||||
|  |     instance of numpy.ndarray (for CPU vectors) | ||||||
|  |     or cupy.ndarray (for GPU vectors). | ||||||
|  | 
 | ||||||
|  |     vectors.key2row is a dictionary mapping word hashes to rows | ||||||
|  |     in the vectors.data table. The array `vectors.keys` keeps | ||||||
|  |     the keys in order, such that keys[vectors.key2row[key]] == key. | ||||||
|  |     ''' | ||||||
|     cdef public object data |     cdef public object data | ||||||
|     cdef readonly StringStore strings |     cdef readonly StringStore strings | ||||||
|     cdef public object key2row |     cdef public object key2row | ||||||
|     cdef public object keys |     cdef public object keys | ||||||
|     cdef public int i |     cdef public int i | ||||||
| 
 | 
 | ||||||
|     def __init__(self, strings, data_or_width): |     def __init__(self, strings, data_or_width=0): | ||||||
|         self.strings = StringStore() |         if isinstance(strings, StringStore): | ||||||
|  |             self.strings = strings | ||||||
|  |         else: | ||||||
|  |             self.strings = StringStore() | ||||||
|  |             for string in strings: | ||||||
|  |                 self.strings.add(string) | ||||||
|         if isinstance(data_or_width, int): |         if isinstance(data_or_width, int): | ||||||
|             self.data = data = numpy.zeros((len(strings), data_or_width), |             self.data = data = numpy.zeros((len(strings), data_or_width), | ||||||
|                                            dtype='f') |                                            dtype='f') | ||||||
|  | @ -37,6 +53,11 @@ cdef class Vectors: | ||||||
|         return (Vectors, (self.strings, self.data)) |         return (Vectors, (self.strings, self.data)) | ||||||
| 
 | 
 | ||||||
|     def __getitem__(self, key): |     def __getitem__(self, key): | ||||||
|  |         '''Get a vector by key. If key is a string, it is hashed | ||||||
|  |         to an integer ID using the vectors.strings table. | ||||||
|  | 
 | ||||||
|  |         If the integer key is not found in the table, a KeyError is raised. | ||||||
|  |         ''' | ||||||
|         if isinstance(key, basestring): |         if isinstance(key, basestring): | ||||||
|             key = self.strings[key] |             key = self.strings[key] | ||||||
|         i = self.key2row[key] |         i = self.key2row[key] | ||||||
|  | @ -46,23 +67,30 @@ cdef class Vectors: | ||||||
|             return self.data[i] |             return self.data[i] | ||||||
| 
 | 
 | ||||||
|     def __setitem__(self, key, vector): |     def __setitem__(self, key, vector): | ||||||
|  |         '''Set a vector for the given key. If key is a string, it is hashed | ||||||
|  |         to an integer ID using the vectors.strings table. | ||||||
|  |         ''' | ||||||
|         if isinstance(key, basestring): |         if isinstance(key, basestring): | ||||||
|             key = self.strings.add(key) |             key = self.strings.add(key) | ||||||
|         i = self.key2row[key] |         i = self.key2row[key] | ||||||
|         self.data[i] = vector |         self.data[i] = vector | ||||||
| 
 | 
 | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|  |         '''Yield vectors from the table.''' | ||||||
|         yield from self.data |         yield from self.data | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|  |         '''Return the number of vectors that have been assigned.''' | ||||||
|         return self.i |         return self.i | ||||||
| 
 | 
 | ||||||
|     def __contains__(self, key): |     def __contains__(self, key): | ||||||
|  |         '''Check whether a key has a vector entry in the table.''' | ||||||
|         if isinstance(key, basestring_): |         if isinstance(key, basestring_): | ||||||
|             key = self.strings[key] |             key = self.strings[key] | ||||||
|         return key in self.key2row |         return key in self.key2row | ||||||
| 
 | 
 | ||||||
|     def add(self, key, vector=None): |     def add(self, key, vector=None): | ||||||
|  |         '''Add a key to the table, optionally setting a vector value as well.''' | ||||||
|         if isinstance(key, basestring_): |         if isinstance(key, basestring_): | ||||||
|             key = self.strings.add(key) |             key = self.strings.add(key) | ||||||
|         if key not in self.key2row: |         if key not in self.key2row: | ||||||
|  | @ -80,7 +108,9 @@ cdef class Vectors: | ||||||
|         return i |         return i | ||||||
| 
 | 
 | ||||||
|     def items(self): |     def items(self): | ||||||
|         for i, string in enumerate(self.strings): |         '''Iterate over (string key, vector) pairs, in order.''' | ||||||
|  |         for i, key in enumerate(self.keys): | ||||||
|  |             string = self.strings[key] | ||||||
|             yield string, self.data[i] |             yield string, self.data[i] | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  | @ -118,9 +148,14 @@ cdef class Vectors: | ||||||
|             self.data |             self.data | ||||||
| 
 | 
 | ||||||
|     def to_disk(self, path, **exclude): |     def to_disk(self, path, **exclude): | ||||||
|  |         xp = get_array_module(self.data) | ||||||
|  |         if xp is numpy: | ||||||
|  |             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) | ||||||
|  |         else: | ||||||
|  |             save_array = lambda arr, file_: xp.save(file_, arr) | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), |             ('vectors', lambda p: save_array(self.data, p.open('wb'))), | ||||||
|             ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), |             ('keys', lambda p: xp.save(p.open('wb'), self.keys)) | ||||||
|         )) |         )) | ||||||
|         return util.to_disk(path, serializers, exclude) |         return util.to_disk(path, serializers, exclude) | ||||||
| 
 | 
 | ||||||
|  | @ -133,8 +168,9 @@ cdef class Vectors: | ||||||
|                     self.key2row[key] = i |                     self.key2row[key] = i | ||||||
| 
 | 
 | ||||||
|         def load_vectors(path): |         def load_vectors(path): | ||||||
|  |             xp = Model.ops.xp | ||||||
|             if path.exists(): |             if path.exists(): | ||||||
|                 self.data = numpy.load(path) |                 self.data = xp.load(path) | ||||||
| 
 | 
 | ||||||
|         serializers = OrderedDict(( |         serializers = OrderedDict(( | ||||||
|             ('keys', load_keys), |             ('keys', load_keys), | ||||||
|  |  | ||||||
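The expanded docstrings above describe the `Vectors` layout: the raw table lives in `vectors.data`, and `key2row` maps string hashes to rows. A sketch of the add/lookup cycle, under the assumption that `add()` writes the optional vector into its row as the new docstring states:

    import numpy
    from spacy.vectors import Vectors

    # Two keys, three dimensions; keys are registered as rows via add().
    vectors = Vectors([u'apple', u'orange'], 3)
    vectors.add(u'apple', vector=numpy.asarray([1., 2., 3.], dtype='f'))
    vectors.add(u'orange', vector=numpy.asarray([-1., 0., 1.], dtype='f'))

    assert u'apple' in vectors        # the hash now has an entry in key2row
    print(vectors[u'apple'])          # row looked up via key2row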
|  | @ -27,6 +27,7 @@ from .vectors import Vectors | ||||||
| from . import util | from . import util | ||||||
| from . import attrs | from . import attrs | ||||||
| from . import symbols | from . import symbols | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Vocab: | cdef class Vocab: | ||||||
|  | @ -65,7 +66,7 @@ cdef class Vocab: | ||||||
|                 self.strings.add(name) |                 self.strings.add(name) | ||||||
|         self.lex_attr_getters = lex_attr_getters |         self.lex_attr_getters = lex_attr_getters | ||||||
|         self.morphology = Morphology(self.strings, tag_map, lemmatizer) |         self.morphology = Morphology(self.strings, tag_map, lemmatizer) | ||||||
|         self.vectors = Vectors(self.strings, 300) |         self.vectors = Vectors(self.strings) | ||||||
| 
 | 
 | ||||||
|     property lang: |     property lang: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|  | @ -261,7 +262,7 @@ cdef class Vocab: | ||||||
|         Words can be looked up by string or int ID. |         Words can be looked up by string or int ID. | ||||||
| 
 | 
 | ||||||
|         RETURNS: |         RETURNS: | ||||||
|             A word vector. Size and shape determed by the |             A word vector. Size and shape determined by the | ||||||
|             vocab.vectors instance. Usually, a numpy ndarray |             vocab.vectors instance. Usually, a numpy ndarray | ||||||
|             of shape (300,) and dtype float32. |             of shape (300,) and dtype float32. | ||||||
| 
 | 
 | ||||||
|  | @ -323,6 +324,7 @@ cdef class Vocab: | ||||||
|             self.lexemes_from_bytes(file_.read()) |             self.lexemes_from_bytes(file_.read()) | ||||||
|         if self.vectors is not None: |         if self.vectors is not None: | ||||||
|             self.vectors.from_disk(path, exclude='strings.json') |             self.vectors.from_disk(path, exclude='strings.json') | ||||||
|  |         link_vectors_to_models(self) | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self, **exclude): |     def to_bytes(self, **exclude): | ||||||
|  | @ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir, | ||||||
|     vocab.lex_attr_getters = lex_attr_getters |     vocab.lex_attr_getters = lex_attr_getters | ||||||
|     vocab.lexemes_from_bytes(lexemes_data) |     vocab.lexemes_from_bytes(lexemes_data) | ||||||
|     vocab.length = length |     vocab.length = length | ||||||
|  |     link_vectors_to_models(vocab) | ||||||
|     return vocab |     return vocab | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -17,6 +17,7 @@ fi | ||||||
| 
 | 
 | ||||||
| if [ "${VIA}" == "compile" ]; then | if [ "${VIA}" == "compile" ]; then | ||||||
|   pip install -r requirements.txt |   pip install -r requirements.txt | ||||||
|  |   python setup.py build_ext --inplace | ||||||
|   pip install -e . |   pip install -e . | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,4 +8,5 @@ include _includes/_mixins | ||||||
|         | does not exist! |         | does not exist! | ||||||
| 
 | 
 | ||||||
|     h2.c-landing__title.u-heading-3.u-padding-small |     h2.c-landing__title.u-heading-3.u-padding-small | ||||||
|         a(href="javascript:history.go(-1)") Click here to go back. |         +button(false, true, "secondary-light")(href="javascript:history.go(-1)") | ||||||
|  |             |  Click here to go back | ||||||
|  |  | ||||||
|  | @ -3,24 +3,22 @@ | ||||||
|         "landing": true, |         "landing": true, | ||||||
|         "logos": [ |         "logos": [ | ||||||
|             { |             { | ||||||
|                 "quora": [ "https://www.quora.com", 150 ], |                 "airbnb": [ "https://www.airbnb.com", 150, 45], | ||||||
|                 "chartbeat": [ "https://chartbeat.com", 200 ], |                 "quora": [ "https://www.quora.com", 120, 34 ], | ||||||
|                 "duedil": [ "https://www.duedil.com", 150 ], |                 "retriever": [ "https://www.retriever.no", 150, 33 ], | ||||||
|                 "stitchfix": [ "https://www.stitchfix.com", 190 ] |                 "stitchfix": [ "https://www.stitchfix.com", 150, 18 ] | ||||||
|             }, |             }, | ||||||
|             { |             { | ||||||
|                 "wayblazer": [ "http://wayblazer.com", 200 ], |                 "chartbeat": [ "https://chartbeat.com", 180, 25 ], | ||||||
|                 "indico": [ "https://indico.io", 150 ], |                 "allenai": [ "https://allenai.org", 220, 37 ] | ||||||
|                 "chattermill": [ "https://chattermill.io", 175 ], |             } | ||||||
|                 "turi": [ "https://turi.com", 150 ], |         ], | ||||||
|                 "kip": [ "http://kipthis.com", 70 ] |         "features": [ | ||||||
|             }, |  | ||||||
|             { |             { | ||||||
|                 "socrata": [ "https://www.socrata.com", 150 ], |                 "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28], | ||||||
|                 "cytora": [ "http://www.cytora.com", 125 ], |                 "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77], | ||||||
|                 "signaln": [ "http://signaln.com", 150 ], |                 "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19], | ||||||
|                 "wonderflow": [ "http://www.wonderflow.co", 200 ], |                 "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28] | ||||||
|                 "synapsify": [ "http://www.gosynapsify.com", 150 ] |  | ||||||
|             } |             } | ||||||
|         ] |         ] | ||||||
|     }, |     }, | ||||||
|  | @ -34,7 +32,24 @@ | ||||||
|         "landing": true |         "landing": true | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "announcement" : { |     "styleguide": { | ||||||
|         "title": "Important Announcement" |         "title": "Styleguide", | ||||||
|  |         "sidebar": { | ||||||
|  |             "Styleguide": { "": "styleguide" }, | ||||||
|  |             "Resources": { | ||||||
|  |                 "Website Source": "https://github.com/explosion/spacy/tree/master/website", | ||||||
|  |                 "Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md" | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |         "menu": { | ||||||
|  |             "Introduction": "intro", | ||||||
|  |             "Logo": "logo", | ||||||
|  |             "Colors": "colors", | ||||||
|  |             "Typography": "typography", | ||||||
|  |             "Elements": "elements", | ||||||
|  |             "Components": "components", | ||||||
|  |             "Embeds": "embeds", | ||||||
|  |             "Markup Reference": "markup" | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -11,12 +11,9 @@ | ||||||
|         "COMPANY": "Explosion AI", |         "COMPANY": "Explosion AI", | ||||||
|         "COMPANY_URL": "https://explosion.ai", |         "COMPANY_URL": "https://explosion.ai", | ||||||
|         "DEMOS_URL": "https://demos.explosion.ai", |         "DEMOS_URL": "https://demos.explosion.ai", | ||||||
|  |         "MODELS_REPO": "explosion/spacy-models", | ||||||
| 
 | 
 | ||||||
|         "SPACY_VERSION": "1.8", |         "SPACY_VERSION": "2.0", | ||||||
|         "LATEST_NEWS": { |  | ||||||
|             "url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha", |  | ||||||
|             "title": "Test spaCy v2.0.0 alpha!" |  | ||||||
|         }, |  | ||||||
| 
 | 
 | ||||||
|         "SOCIAL": { |         "SOCIAL": { | ||||||
|             "twitter": "spacy_io", |             "twitter": "spacy_io", | ||||||
|  | @ -27,25 +24,23 @@ | ||||||
|         }, |         }, | ||||||
| 
 | 
 | ||||||
|         "NAVIGATION": { |         "NAVIGATION": { | ||||||
|             "Home": "/", |             "Usage": "/usage", | ||||||
|             "Usage": "/docs/usage", |             "Models": "/models", | ||||||
|             "Reference": "/docs/api", |             "API": "/api" | ||||||
|             "Demos": "/docs/usage/showcase", |  | ||||||
|             "Blog": "https://explosion.ai/blog" |  | ||||||
|         }, |         }, | ||||||
| 
 | 
 | ||||||
|         "FOOTER": { |         "FOOTER": { | ||||||
|             "spaCy": { |             "spaCy": { | ||||||
|                 "Usage": "/docs/usage", |                 "Usage": "/usage", | ||||||
|                 "API Reference": "/docs/api", |                 "Models": "/models", | ||||||
|                 "Tutorials": "/docs/usage/tutorials", |                 "API Reference": "/api", | ||||||
|                 "Showcase": "/docs/usage/showcase" |                 "Resources": "/usage/resources" | ||||||
|             }, |             }, | ||||||
|             "Support": { |             "Support": { | ||||||
|                 "Issue Tracker": "https://github.com/explosion/spaCy/issues", |                 "Issue Tracker": "https://github.com/explosion/spaCy/issues", | ||||||
|                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", |                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", | ||||||
|                 "Reddit usergroup": "https://www.reddit.com/r/spacynlp/", |                 "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/", | ||||||
|                 "Gitter chat": "https://gitter.im/explosion/spaCy" |                 "Gitter Chat": "https://gitter.im/explosion/spaCy" | ||||||
|             }, |             }, | ||||||
|             "Connect": { |             "Connect": { | ||||||
|                 "Twitter": "https://twitter.com/spacy_io", |                 "Twitter": "https://twitter.com/spacy_io", | ||||||
|  | @ -74,21 +69,11 @@ | ||||||
|                 {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, |                 {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, | ||||||
|                 {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] |                 {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVDIA graphics card with CUDA 2+. See section below for more info."}] | ||||||
|             }, |             }, | ||||||
|             { "id": "model", "title": "Models", "multiple": true, "options": [ |             { "id": "model", "title": "Models", "multiple": true } | ||||||
|                 { "id": "en", "title": "English", "meta": "50MB" }, |  | ||||||
|                 { "id": "de", "title": "German", "meta": "645MB" }, |  | ||||||
|                 { "id": "fr", "title": "French", "meta": "1.33GB" }, |  | ||||||
|                 { "id": "es", "title": "Spanish", "meta": "377MB"}] |  | ||||||
|             } |  | ||||||
|         ], |         ], | ||||||
| 
 | 
 | ||||||
|         "QUICKSTART_MODELS": [ |         "QUICKSTART_MODELS": [ | ||||||
|             { "id": "lang", "title": "Language", "options": [ |             { "id": "lang", "title": "Language"}, | ||||||
|                 { "id": "en", "title": "English", "checked": true }, |  | ||||||
|                 { "id": "de", "title": "German" }, |  | ||||||
|                 { "id": "fr", "title": "French" }, |  | ||||||
|                 { "id": "es", "title": "Spanish" }] |  | ||||||
|             }, |  | ||||||
|             { "id": "load", "title": "Loading style", "options": [ |             { "id": "load", "title": "Loading style", "options": [ | ||||||
|                 { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, |                 { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, | ||||||
|                 {  "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] |                 {  "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] | ||||||
|  | @ -98,50 +83,15 @@ | ||||||
|             } |             } | ||||||
|         ], |         ], | ||||||
| 
 | 
 | ||||||
|         "MODELS": { |  | ||||||
|             "en": [ |  | ||||||
|                 { "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true }, |  | ||||||
|                 { "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" }, |  | ||||||
|                 { "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" }, |  | ||||||
|                 { "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" } |  | ||||||
|             ], |  | ||||||
|             "de": [ |  | ||||||
|                 { "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" } |  | ||||||
|             ], |  | ||||||
|             "fr": [ |  | ||||||
|                 { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } |  | ||||||
|             ], |  | ||||||
|             "es": [ |  | ||||||
|                 { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} |  | ||||||
|             ] |  | ||||||
|         }, |  | ||||||
| 
 |  | ||||||
|         "EXAMPLE_SENTENCES": { |  | ||||||
|             "en": "This is a sentence.", |  | ||||||
|             "de": "Dies ist ein Satz.", |  | ||||||
|             "fr": "C'est une phrase.", |  | ||||||
|             "es": "Esto es una frase." |  | ||||||
|         }, |  | ||||||
| 
 |  | ||||||
|         "ALPHA": true, |         "ALPHA": true, | ||||||
|         "V_CSS": "1.6", |         "V_CSS": "2.0", | ||||||
|         "V_JS": "1.2", |         "V_JS": "2.0", | ||||||
|         "DEFAULT_SYNTAX": "python", |         "DEFAULT_SYNTAX": "python", | ||||||
|         "ANALYTICS": "UA-58931649-1", |         "ANALYTICS": "UA-58931649-1", | ||||||
|         "MAILCHIMP": { |         "MAILCHIMP": { | ||||||
|             "user": "spacy.us12", |             "user": "spacy.us12", | ||||||
|             "id": "83b0498b1e7fa3c91ce68c3f1", |             "id": "83b0498b1e7fa3c91ce68c3f1", | ||||||
|             "list": "89ad33e698" |             "list": "89ad33e698" | ||||||
|         }, |  | ||||||
|         "BADGES": { |  | ||||||
|             "pipy": { |  | ||||||
|                 "badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square", |  | ||||||
|                 "link": "https://pypi.python.org/pypi/spacy" |  | ||||||
|             }, |  | ||||||
|             "conda": { |  | ||||||
|                 "badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg", |  | ||||||
|                 "link": "https://anaconda.org/conda-forge/spacy" |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -1,8 +1,6 @@ | ||||||
| //- 💫 INCLUDES > FOOTER | //- 💫 INCLUDES > FOOTER | ||||||
| 
 | 
 | ||||||
| include _mixins | footer.o-footer.u-text | ||||||
| 
 |  | ||||||
| footer.o-footer.u-text.u-border-dotted |  | ||||||
|     +grid.o-content |     +grid.o-content | ||||||
|         each group, label in FOOTER |         each group, label in FOOTER | ||||||
|             +grid-col("quarter") |             +grid-col("quarter") | ||||||
|  | @ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted | ||||||
|                         li |                         li | ||||||
|                             +a(url)=item |                             +a(url)=item | ||||||
| 
 | 
 | ||||||
|         if SECTION != "docs" |         if SECTION == "index" | ||||||
|             +grid-col("quarter") |             +grid-col("quarter") | ||||||
|                 include _newsletter |                 include _newsletter | ||||||
| 
 | 
 | ||||||
|     if SECTION == "docs" |     if SECTION != "index" | ||||||
|         .o-content.o-block.u-border-dotted |         .o-content.o-block.u-border-dotted | ||||||
|             include _newsletter |             include _newsletter | ||||||
| 
 | 
 | ||||||
|     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle |     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle | ||||||
|         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] |         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] | ||||||
| 
 | 
 | ||||||
|         +a(COMPANY_URL, true) |         +a(COMPANY_URL, true)(aria-label="Explosion AI") | ||||||
|             +svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale |             +icon("explosion", 45).o-icon.u-color-theme.u-grayscale | ||||||
| 
 | 
 | ||||||
|         +a(COMPANY_URL + "/legal", true) Legal / Imprint |         +a(COMPANY_URL + "/legal", true) Legal / Imprint | ||||||
|  |  | ||||||
|  | @ -1,35 +1,71 @@ | ||||||
| //- 💫 INCLUDES > FUNCTIONS | //- 💫 INCLUDES > FUNCTIONS | ||||||
| 
 | 
 | ||||||
| //- More descriptive variables for current.path and current.source | //- Descriptive variables, available in the global scope | ||||||
| 
 | 
 | ||||||
| - CURRENT = current.source | - CURRENT = current.source | ||||||
| - SECTION = current.path[0] | - SECTION = current.path[0] | ||||||
| - SUBSECTION = current.path[1] | - LANGUAGES = public.models._data.LANGUAGES | ||||||
|  | - MODELS = public.models._data.MODELS | ||||||
|  | - CURRENT_MODELS = MODELS[current.source] || [] | ||||||
|  | 
 | ||||||
|  | - MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b) | ||||||
|  | - MODEL_LANG_COUNT = Object.keys(MODELS).length | ||||||
|  | - LANG_COUNT = Object.keys(LANGUAGES).length | ||||||
|  | 
 | ||||||
|  | - MODEL_META = public.models._data.MODEL_META | ||||||
|  | - MODEL_LICENSES = public.models._data.MODEL_LICENSES | ||||||
|  | - MODEL_ACCURACY = public.models._data.MODEL_ACCURACY | ||||||
|  | - EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES | ||||||
|  | 
 | ||||||
|  | - IS_PAGE = (SECTION != "index") && !landing | ||||||
|  | - IS_MODELS = (SECTION == "models" && LANGUAGES[current.source]) | ||||||
|  | - HAS_MODELS = IS_MODELS && CURRENT_MODELS.length | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Add prefixes to items of an array (for modifier CSS classes) | //- Add prefixes to items of an array (for modifier CSS classes) | ||||||
|  |     array   - [array] list of class names or options, e.g. ["foot"] | ||||||
|  |     prefix  - [string] prefix to add to each class, e.g. "c-table__row" | ||||||
|  |     RETURNS - [array] list of modified class names | ||||||
| 
 | 
 | ||||||
| -   function prefixArgs(array, prefix) { | -   function prefixArgs(array, prefix) { | ||||||
| -       return array.map(function(arg) { | -       return array.map(arg => prefix + '--' + arg).join(' '); | ||||||
| -           return prefix + '--' + arg; | -   } | ||||||
| -       }).join(' '); | 
 | ||||||
|  | 
 | ||||||
|  | //- Convert API paths (semi-temporary fix for renamed sections) | ||||||
|  |     path    - [string] link path supplied to +api mixin | ||||||
|  |     RETURNS - [string] new link path to correct location | ||||||
|  | 
 | ||||||
|  | -   function convertAPIPath(path) { | ||||||
|  | -       if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) { | ||||||
|  | -           var comps = path.split('#'); | ||||||
|  | -           return "top-level#" + comps[0] + '.' + comps[1]; | ||||||
|  | -       } | ||||||
|  | -       else if (path.startsWith('cli#')) { | ||||||
|  | -           return "top-level#" + path.split('#')[1]; | ||||||
|  | -       } | ||||||
|  | -       return path; | ||||||
|  | -   } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Get model components from ID. Components can then be looked up in LANGUAGES | ||||||
|  |     and MODEL_META respectively, to get their human-readable form. | ||||||
|  |     id      - [string] model ID, e.g. "en_core_web_sm" | ||||||
|  |     RETURNS - [object] object keyed by components lang, type, genre and size | ||||||
|  | 
 | ||||||
|  | -   function getModelComponents(id) { | ||||||
|  | -       var comps = id.split('_'); | ||||||
|  | -       return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]} | ||||||
| -   } | -   } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Generate GitHub links | //- Generate GitHub links | ||||||
|  |     repo     - [string] name of repo owned by explosion | ||||||
|  |     filepath - [string] logical path to file relative to repository root | ||||||
|  |     branch   - [string] optional branch, defaults to "master" | ||||||
|  |     RETURNS  - [string] the correct link to the file on GitHub | ||||||
| 
 | 
 | ||||||
| -   function gh(repo, filepath, branch) { | -   function gh(repo, filepath, branch) { | ||||||
| -       var branch = ALPHA ? 'develop' : branch | -       var branch = ALPHA ? 'develop' : branch | ||||||
| -       return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | -       return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | ||||||
| -   } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- Get social images |  | ||||||
| 
 |  | ||||||
| -   function getSocialImg() { |  | ||||||
| -       var base = SITE_URL + '/assets/img/social/preview_' |  | ||||||
| -       var image = ALPHA ? 'alpha' : 'default' |  | ||||||
| -       if (preview) image = preview |  | ||||||
| -       else if (SECTION == 'docs' && !ALPHA) image = 'docs' |  | ||||||
| -       return base + image + '.jpg' |  | ||||||
| -   } | -   } | ||||||
|  |  | ||||||
|  | @ -1,5 +1,13 @@ | ||||||
| //- 💫 MIXINS > BASE | //- 💫 MIXINS > BASE | ||||||
| 
 | 
 | ||||||
|  | //- Section | ||||||
|  |     id - [string] anchor assigned to section (used for breadcrumb navigation) | ||||||
|  | 
 | ||||||
|  | mixin section(id) | ||||||
|  |     section.o-section(id="section-" + id data-section=id) | ||||||
|  |         block | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Aside wrapper | //- Aside wrapper | ||||||
|     label - [string] aside label |     label - [string] aside label | ||||||
| 
 | 
 | ||||||
|  | @ -11,34 +19,26 @@ mixin aside-wrapper(label) | ||||||
| 
 | 
 | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| //- Date |  | ||||||
|     input - [string] date in the format YYYY-MM-DD |  | ||||||
| 
 | 
 | ||||||
| mixin date(input) | //- SVG from map (uses embedded SVG sprite) | ||||||
|     - var date = new Date(input) |  | ||||||
|     - var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ] |  | ||||||
| 
 |  | ||||||
|     time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear() |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- SVG from map |  | ||||||
|     file   - [string] SVG file name in /assets/img/ |  | ||||||
|     name   - [string] SVG symbol id |     name   - [string] SVG symbol id | ||||||
|     width  - [integer] width in px |     width  - [integer] width in px | ||||||
|     height - [integer] height in px (default: same as width) |     height - [integer] height in px (default: same as width) | ||||||
| 
 | 
 | ||||||
| mixin svg(file, name, width, height) | mixin svg(name, width, height) | ||||||
|     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) |     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) | ||||||
|         use(xlink:href="/assets/img/#{file}.svg##{name}") |         use(xlink:href="#svg_#{name}") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Icon | //- Icon | ||||||
|     name - [string] icon name, should be SVG symbol ID |     name   - [string] icon name (will be used as symbol id: #svg_{name}) | ||||||
|     size - [integer] icon width and height (default: 20) |     width  - [integer] icon width (default: 20) | ||||||
|  |     height - [integer] icon height (defaults to width) | ||||||
| 
 | 
 | ||||||
| mixin icon(name, size) | mixin icon(name, width, height) | ||||||
|     - var size = size || 20 |     - var width = width || 20 | ||||||
|     +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) |     - var height = height || width | ||||||
|  |     +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Pro/Con/Neutral icon | //- Pro/Con/Neutral icon | ||||||
|  | @ -46,8 +46,8 @@ mixin icon(name, size) | ||||||
|     size - [integer] icon size (optional) |     size - [integer] icon size (optional) | ||||||
| 
 | 
 | ||||||
| mixin procon(icon, size) | mixin procon(icon, size) | ||||||
|     - colors = { pro: "green", con: "red", neutral: "yellow" } |     - colors = { pro: "green", con: "red", neutral: "subtle" } | ||||||
|     +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) |     +icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Headlines Helper Mixin | //- Headlines Helper Mixin | ||||||
|  | @ -80,8 +80,7 @@ mixin headline(level) | ||||||
| 
 | 
 | ||||||
| mixin permalink(id) | mixin permalink(id) | ||||||
|     if id |     if id | ||||||
|         a.u-permalink(id=id href="##{id}") |         a.u-permalink(href="##{id}") | ||||||
|             +icon("anchor").u-permalink__icon |  | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|     else |     else | ||||||
|  | @ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results) | ||||||
|                     .c-quickstart__fields |                     .c-quickstart__fields | ||||||
|                         for option in group.options |                         for option in group.options | ||||||
|                             input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) |                             input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) | ||||||
|                             label.c-quickstart__label(for="qs-#{option.id}")!=option.title |                             label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title | ||||||
|                                 if option.meta |                                 if option.meta | ||||||
|                                     |  #[span.c-quickstart__label__meta (#{option.meta})] |                                     |  #[span.c-quickstart__label__meta (#{option.meta})] | ||||||
|                                 if option.help |                                 if option.help | ||||||
|  | @ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results) | ||||||
|                 code.c-code-block__content.c-quickstart__code(data-qs-results="") |                 code.c-code-block__content.c-quickstart__code(data-qs-results="") | ||||||
|                     block |                     block | ||||||
| 
 | 
 | ||||||
|     .c-quickstart__info.u-text-tiny.o-block.u-text-right |  | ||||||
|         |  Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| //- Quickstart code item | //- Quickstart code item | ||||||
|     data [object] - Rendering conditions (keyed by option group ID, value: option) |     data  - [object] Rendering conditions (keyed by option group ID, value: option) | ||||||
|  |     style - [string] modifier ID for line style | ||||||
| 
 | 
 | ||||||
| mixin qs(data, style) | mixin qs(data, style) | ||||||
|     - args = {} |     - args = {} | ||||||
|  | @ -148,6 +145,13 @@ mixin terminal(label) | ||||||
|         +code.x-terminal__code |         +code.x-terminal__code | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|  | //- Chart.js | ||||||
|  |     id - [string] chart ID, will be assigned as #chart_{id} | ||||||
|  | 
 | ||||||
|  | mixin chart(id) | ||||||
|  |     figure.o-block&attributes(attributes) | ||||||
|  |         canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%") | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| //- Gitter chat button and widget | //- Gitter chat button and widget | ||||||
|     button - [string] text shown on button |     button - [string] text shown on button | ||||||
|  | @ -156,26 +160,24 @@ mixin terminal(label) | ||||||
| mixin gitter(button, label) | mixin gitter(button, label) | ||||||
|     aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) |     aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) | ||||||
| 
 | 
 | ||||||
|     button.js-gitter-button.c-chat__button.u-text-small |     button.js-gitter-button.c-chat__button.u-text-tag | ||||||
|         +icon("chat").o-icon--inline |         +icon("chat", 16).o-icon--inline | ||||||
|         !=button |         !=button | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Badge | //- Badge | ||||||
|     name - [string] "pipy" or "conda" |     image - [string] path to badge image | ||||||
|  |     url   - [string] badge link | ||||||
| 
 | 
 | ||||||
| mixin badge(name) | mixin badge(image, url) | ||||||
|     - site = BADGES[name] |     +a(url).u-padding-small.u-hide-link&attributes(attributes) | ||||||
| 
 |         img.o-badge(src=image alt=url height="20") | ||||||
|     if site |  | ||||||
|         +a(site.link).u-padding-small |  | ||||||
|             img(src=site.badge alt="{name} version" height="20") |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Logo | //- spaCy logo | ||||||
| 
 | 
 | ||||||
| mixin logo() | mixin logo() | ||||||
|     +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) |     +svg("spacy", 675, 215).o-logo&attributes(attributes) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Landing | //- Landing | ||||||
|  | @ -186,18 +188,56 @@ mixin landing-header() | ||||||
|             .c-landing__content |             .c-landing__content | ||||||
|                 block |                 block | ||||||
| 
 | 
 | ||||||
|  | mixin landing-banner(headline, label) | ||||||
|  |     .c-landing__banner.u-padding.o-block.u-color-light | ||||||
|  |         +grid.c-landing__banner__content.o-no-block | ||||||
|  |             +grid-col("third") | ||||||
|  |                 h3.u-heading.u-heading-1 | ||||||
|  |                     if label | ||||||
|  |                         div | ||||||
|  |                             span.u-text-label.u-text-label--light=label | ||||||
|  |                     !=headline | ||||||
| 
 | 
 | ||||||
| mixin landing-badge(url, graphic, alt, size) |             +grid-col("two-thirds").c-landing__banner__text | ||||||
|     +a(url)(aria-label=alt title=alt).c-landing__badge |                 block | ||||||
|         +svg("graphics", graphic, size || 225) | 
 | ||||||
|  | 
 | ||||||
|  | mixin landing-logos(title, logos) | ||||||
|  |     .o-content.u-text-center&attributes(attributes) | ||||||
|  |         h3.u-heading.u-text-label.u-color-dark=title | ||||||
|  | 
 | ||||||
|  |         each row, i in logos | ||||||
|  |             - var is_last = i == logos.length - 1 | ||||||
|  |             +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null) | ||||||
|  |                 each details, name in row | ||||||
|  |                     +a(details[0]).u-padding-medium | ||||||
|  |                         +icon(name, details[1], details[2]) | ||||||
|  | 
 | ||||||
|  |                 if is_last | ||||||
|  |                     block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Under construction (temporary) | //- Under construction (temporary) | ||||||
|     Marks sections that still need to be completed for the v2.0 release. |     Marks sections that still need to be completed for the v2.0 release. | ||||||
| 
 | 
 | ||||||
| mixin under-construction() | mixin under-construction() | ||||||
|     +infobox("🚧 Under construction") |     +infobox("Under construction", "🚧") | ||||||
|         |  This section is still being written and will be updated for the v2.0 |         |  This section is still being written and will be updated for the v2.0 | ||||||
|         |  release. Is there anything that you think should definitely be mentioned or |         |  release. Is there anything that you think should definitely be mentioned or | ||||||
|         |  explained here? Any examples you'd like to see? #[strong Let us know] |         |  explained here? Any examples you'd like to see? #[strong Let us know] | ||||||
|         |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! |         |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Alpha infobox (temporary) | ||||||
|  |     Added in the templates to notify user that they're visiting the alpha site. | ||||||
|  | 
 | ||||||
|  | mixin alpha-info() | ||||||
|  |     +infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️") | ||||||
|  |         strong This page is part of the alpha documentation for spaCy v2.0. | ||||||
|  |         |  It does not reflect the state of the latest stable release. | ||||||
|  |         |  Because v2.0 is still under development, the implementation | ||||||
|  |         |  may differ from the intended state described here. See the | ||||||
|  |         |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] | ||||||
|  |         |  for details on how to install and test the new version. To | ||||||
|  |         |  read the official docs for spaCy v1.x, | ||||||
|  |         |  #[+a("https://spacy.io/docs") go here]. | ||||||
|  |  | ||||||
|  | @ -8,11 +8,15 @@ include _mixins-base | ||||||
|     level - [integer] headline level, corresponds to h1, h2, h3 etc. |     level - [integer] headline level, corresponds to h1, h2, h3 etc. | ||||||
|     id    - [string] unique identifier, creates permalink (optional) |     id    - [string] unique identifier, creates permalink (optional) | ||||||
| 
 | 
 | ||||||
| mixin h(level, id) | mixin h(level, id, source) | ||||||
|     +headline(level).u-heading&attributes(attributes) |     +headline(level).u-heading(id=id)&attributes(attributes) | ||||||
|         +permalink(id) |         +permalink(id) | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
|  |         if source | ||||||
|  |             +button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right | ||||||
|  |                 span Source #[+icon("code", 14).o-icon--inline] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| //- External links | //- External links | ||||||
|     url     - [string] link href |     url     - [string] link href | ||||||
|  | @ -38,21 +42,23 @@ mixin src(url) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- API link (with added tag and automatically generated path) | //- API link (with added tag and automatically generated path) | ||||||
|     path - [string] path to API docs page relative to /docs/api/ |     path - [string] path to API docs page relative to /api/ | ||||||
| 
 | 
 | ||||||
| mixin api(path) | mixin api(path) | ||||||
|     +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap |     - path = convertAPIPath(path) | ||||||
|  |     +a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|         |  #[+icon("book", 18).o-icon--inline.u-color-theme] |         |  #[+icon("book", 16).o-icon--inline.u-color-theme] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Help icon with tooltip | //- Help icon with tooltip | ||||||
|     tooltip - [string] Tooltip text |     tooltip   - [string] Tooltip text | ||||||
|  |     icon_size - [integer] Optional size of help icon in px. | ||||||
| 
 | 
 | ||||||
| mixin help(tooltip) | mixin help(tooltip, icon_size) | ||||||
|     span(data-tooltip=tooltip)&attributes(attributes) |     span(data-tooltip=tooltip)&attributes(attributes) | ||||||
|         +icon("help", 16).i-icon--inline |         +icon("help", icon_size || 16).o-icon--inline | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Aside for text | //- Aside for text | ||||||
|  | @ -68,24 +74,43 @@ mixin aside(label) | ||||||
|     label    - [string] aside title (optional or false for no label) |     label    - [string] aside title (optional or false for no label) | ||||||
|     language - [string] language for syntax highlighting (default: "python") |     language - [string] language for syntax highlighting (default: "python") | ||||||
|                supports basic relevant languages available for PrismJS |                supports basic relevant languages available for PrismJS | ||||||
|  |     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||||
| 
 | 
 | ||||||
| mixin aside-code(label, language) | mixin aside-code(label, language, prompt) | ||||||
|     +aside-wrapper(label) |     +aside-wrapper(label) | ||||||
|         +code(false, language).o-no-block |         +code(false, language, prompt).o-no-block | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Infobox | //- Infobox | ||||||
|     label - [string] infobox title (optional or false for no title) |     label - [string] infobox title (optional or false for no title) | ||||||
|  |     emoji - [string] optional emoji displayed before the title, passed as a | ||||||
|  |             separate argument so it can be wrapped in its own element for spacing | ||||||
| 
 | 
 | ||||||
| mixin infobox(label) | mixin infobox(label, emoji) | ||||||
|     aside.o-box.o-block.u-text-small |     aside.o-box.o-block.u-text-small | ||||||
|         if label |         if label | ||||||
|             h3.u-text-label.u-color-theme=label |             h3.u-heading.u-text-label.u-color-theme | ||||||
|  |                 if emoji | ||||||
|  |                     span.o-emoji=emoji | ||||||
|  |                 |  #{label} | ||||||
| 
 | 
 | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | //- Logos displayed in the top corner of some infoboxes | ||||||
|  |     logos - [array] List of icon ID, width, height and link. | ||||||
|  | 
 | ||||||
|  | mixin infobox-logos(...logos) | ||||||
|  |     .o-box__logos.u-text-right.u-float-right | ||||||
|  |         for logo in logos | ||||||
|  |             if logo[3] | ||||||
|  |                 |  #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]] | ||||||
|  |             else | ||||||
|  |                 |  #[+icon(logo[0], logo[1], logo[2]).u-color-dark] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Link button | //- Link button | ||||||
|     url      - [string] link href |     url      - [string] link href | ||||||
|     trusted  - [boolean] if not set / false, rel="noopener nofollow" is added |     trusted  - [boolean] if not set / false, rel="noopener nofollow" is added | ||||||
|  | @ -94,7 +119,7 @@ mixin infobox(label) | ||||||
|                see assets/css/_components/_buttons.sass |                see assets/css/_components/_buttons.sass | ||||||
| 
 | 
 | ||||||
| mixin button(url, trusted, ...style) | mixin button(url, trusted, ...style) | ||||||
|     - external = url.includes("http") |     - external = url && url.includes("http") | ||||||
|     a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) |     a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|  | @ -103,31 +128,33 @@ mixin button(url, trusted, ...style) | ||||||
|     label    - [string] aside title (optional or false for no label) |     label    - [string] aside title (optional or false for no label) | ||||||
|     language - [string] language for syntax highlighting (default: "python") |     language - [string] language for syntax highlighting (default: "python") | ||||||
|                supports basic relevant languages available for PrismJS |                supports basic relevant languages available for PrismJS | ||||||
|     prompt    - [string] prompt or icon to display next to code block, (mostly used for old/new) |     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||||
|     height   - [integer] optional height to clip code block to |     height   - [integer] optional height to clip code block to | ||||||
|  |     icon     - [string] icon displayed next to code block (e.g. "accept" for new code) | ||||||
|  |     wrap     - [boolean] wrap text and disable horizontal scrolling | ||||||
| 
 | 
 | ||||||
| mixin code(label, language, prompt, height) | mixin code(label, language, prompt, height, icon, wrap) | ||||||
|     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) |     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) | ||||||
|         if label |         if label | ||||||
|             h4.u-text-label.u-text-label--dark=label |             h4.u-text-label.u-text-label--dark=label | ||||||
|         - var icon = (prompt == 'accept' || prompt == 'reject') |         - var icon = icon || (prompt == 'accept' || prompt == 'reject') | ||||||
|         if icon |         if icon | ||||||
|             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} |             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} | ||||||
|             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) |             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) | ||||||
|                 +icon(icon, 18) |                 +icon(icon, 18) | ||||||
| 
 | 
 | ||||||
|         code.c-code-block__content(data-prompt=icon ? null : prompt) |         code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt) | ||||||
|             block |             block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Code blocks to display old/new versions | //- Code blocks to display old/new versions | ||||||
| 
 | 
 | ||||||
| mixin code-old() | mixin code-old() | ||||||
|     +code(false, false, "reject").o-block-small |     +code(false, false, false, false, "reject").o-block-small | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| mixin code-new() | mixin code-new() | ||||||
|     +code(false, false, "accept").o-block-small |     +code(false, false, false, false, "accept").o-block-small | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -138,12 +165,33 @@ mixin code-new() | ||||||
| 
 | 
 | ||||||
| mixin codepen(slug, height, default_tab) | mixin codepen(slug, height, default_tab) | ||||||
|     figure.o-block(style="min-height: #{height}px")&attributes(attributes) |     figure.o-block(style="min-height: #{height}px")&attributes(attributes) | ||||||
|         .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) |         .codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) | ||||||
|             +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen |             +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen | ||||||
| 
 | 
 | ||||||
|         script(async src="https://assets.codepen.io/assets/embed/ei.js") |         script(async src="https://assets.codepen.io/assets/embed/ei.js") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | //- GitHub embed | ||||||
|  |     repo     - [string] repository owned by explosion organization | ||||||
|  |     file     - [string] logical path to file, relative to repository root | ||||||
|  |     alt_file - [string] alternative file path used in footer and link button | ||||||
|  |     height   - [integer] height of code preview in px | ||||||
|  | 
 | ||||||
|  | mixin github(repo, file, alt_file, height) | ||||||
|  |     - var branch = ALPHA ? "develop" : "master" | ||||||
|  |     - var height = height || 250 | ||||||
|  | 
 | ||||||
|  |     figure.o-block | ||||||
|  |         pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px") | ||||||
|  |             code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}") | ||||||
|  | 
 | ||||||
|  |         footer.o-grid.u-text | ||||||
|  |             .o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)] | ||||||
|  |             div | ||||||
|  |                 +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| //- Images / figures | //- Images / figures | ||||||
|     url     - [string] url or path to image |     url     - [string] url or path to image | ||||||
|     width   - [integer] image width in px, for better rendering (default: 500) |     width   - [integer] image width in px, for better rendering (default: 500) | ||||||
|  | @ -168,10 +216,26 @@ mixin image-caption() | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Label | //- Graphic or illustration with button | ||||||
|  |     original - [string] Path to original image | ||||||
|  | 
 | ||||||
|  | mixin graphic(original) | ||||||
|  |     +image | ||||||
|  |         block | ||||||
|  |         if original | ||||||
|  |             .u-text-right | ||||||
|  |                 +button(original, false, "secondary", "small") View large graphic | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | //- Labels | ||||||
| 
 | 
 | ||||||
| mixin label() | mixin label() | ||||||
|     .u-text-label.u-color-subtle&attributes(attributes) |     .u-text-label.u-color-dark&attributes(attributes) | ||||||
|  |         block | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | mixin label-inline() | ||||||
|  |     strong.u-text-label.u-color-dark&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -188,8 +252,10 @@ mixin tag() | ||||||
| mixin tag-model(...capabs) | mixin tag-model(...capabs) | ||||||
|     - var intro = "To use this functionality, spaCy needs a model to be installed" |     - var intro = "To use this functionality, spaCy needs a model to be installed" | ||||||
|     - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" |     - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" | ||||||
|     +tag Requires model | 
 | ||||||
|     +help(intro + ext + ".").u-color-theme |     span.u-nowrap | ||||||
|  |         +tag Needs model | ||||||
|  |         +help(intro + ext + ".").u-color-theme | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- "New" tag to label features new in a specific version | //- "New" tag to label features new in a specific version | ||||||
|  | @ -219,15 +285,9 @@ mixin list(type, start) | ||||||
| 
 | 
 | ||||||
| //- List item (only used within +list) | //- List item (only used within +list) | ||||||
| 
 | 
 | ||||||
| mixin item(procon) | mixin item() | ||||||
|     if procon |     li.c-list__item&attributes(attributes) | ||||||
|         li&attributes(attributes) |         block | ||||||
|             +procon(procon).c-list__icon |  | ||||||
|             block |  | ||||||
| 
 |  | ||||||
|     else |  | ||||||
|         li.c-list__item&attributes(attributes) |  | ||||||
|             block |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table | //- Table | ||||||
|  | @ -237,9 +297,9 @@ mixin table(head) | ||||||
|     table.c-table.o-block&attributes(attributes) |     table.c-table.o-block&attributes(attributes) | ||||||
| 
 | 
 | ||||||
|         if head |         if head | ||||||
|             +row |             +row("head") | ||||||
|                 each column in head |                 each column in head | ||||||
|                     th.c-table__head-cell.u-text-label=column |                     +head-cell=column | ||||||
| 
 | 
 | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
|  | @ -251,10 +311,11 @@ mixin row(...style) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Footer table row (only used within +table) |  | ||||||
| 
 | 
 | ||||||
| mixin footrow() | //- Header table cell (only used within +row) | ||||||
|     tr.c-table__row.c-table__row--foot&attributes(attributes) | 
 | ||||||
|  | mixin head-cell() | ||||||
|  |     th.c-table__head-cell.u-text-label&attributes(attributes) | ||||||
|         block |         block | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -284,71 +345,58 @@ mixin grid-col(width) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Card (only used within +grid) | //- Card (only used within +grid) | ||||||
|     title     - [string] card title |     title  - [string] card title | ||||||
|     details   - [object] url, image, author, description, tags etc. |     url    - [string] link for card | ||||||
|                 (see /docs/usage/_data.json) |     author - [string] optional author, displayed as byline at the bottom | ||||||
|  |     icon   - [string] optional ID of icon displayed with card | ||||||
|  |     width  - [string] optional width of grid column, defaults to "half" | ||||||
| 
 | 
 | ||||||
| mixin card(title, details) | mixin card(title, url, author, icon, width) | ||||||
|     +grid-col("half").o-card.u-text&attributes(attributes) |     +grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes) | ||||||
|         if details.image |         +a(url) | ||||||
|             +a(details.url).o-block-small |             h4.u-heading.u-text-label | ||||||
|                 img(src=details.image alt=title width="300" role="presentation") |                 if icon | ||||||
| 
 |                     +icon(icon, 25).u-float-right | ||||||
|         if title |                 if title | ||||||
|             +a(details.url) |                     span.u-color-dark=title | ||||||
|                 +h(3)=title |             .o-block-small.u-text-small | ||||||
| 
 |                 block | ||||||
|                     if details.author |         if author | ||||||
|                         .u-text-small.u-color-subtle by #{details.author} |             .u-color-subtle.u-text-tiny by #{author} | ||||||
| 
 |  | ||||||
|         if details.description || details.tags |  | ||||||
|             ul |  | ||||||
|                 if details.description |  | ||||||
|                     li=details.description |  | ||||||
| 
 |  | ||||||
|                 if details.tags |  | ||||||
|                     li |  | ||||||
|                         each tag in details.tags |  | ||||||
|                             span.u-text-tag #{tag} |  | ||||||
|                             |   |  | ||||||
| 
 |  | ||||||
|         block |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Simpler card list item (only used within +list) | //- Table of contents, to be used with +item mixins for links | ||||||
|     title     - [string] card title |     col - [string] width of column (see +grid-col) | ||||||
|     details   - [object] url, image, author, description, tags etc. |  | ||||||
|                 (see /docs/usage/_data.json) |  | ||||||
| 
 | 
 | ||||||
| mixin card-item(title, details) | mixin table-of-contents(col) | ||||||
|     +item&attributes(attributes) |     +grid-col(col || "half") | ||||||
|         +a(details.url)=title |         +infobox | ||||||
| 
 |             +label.o-block-small Table of contents | ||||||
|         if details.description |             +list("numbers").u-text-small.o-no-block | ||||||
|             br |                 block | ||||||
|             span=details.description |  | ||||||
| 
 |  | ||||||
|         if details.author |  | ||||||
|             br |  | ||||||
|             span.u-text-small.u-color-subtle by #{details.author} |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table row for models table | //- Bibliography | ||||||
|  |     id - [string] ID of bibliography component, for anchor links. Can be used if | ||||||
|  |          there's more than one bibliography on one page. | ||||||
| 
 | 
 | ||||||
| mixin model-row(name, lang, procon, size, license, default_model, divider) | mixin bibliography(id) | ||||||
|     - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } |     section(id=id || "bibliography") | ||||||
|  |         +infobox | ||||||
|  |             +label.o-block-small Bibliography | ||||||
|  |             +list("numbers").u-text-small.o-no-block | ||||||
|  |                 block | ||||||
| 
 | 
 | ||||||
|     +row(divider ? "divider": null) | 
 | ||||||
|         +cell #[code=name] | //- Footnote | ||||||
|             if default_model |     id      - [string / integer] ID of footnote. | ||||||
|                 |  #[span.u-color-theme(title="default model") #[+icon("star", 16)]] |     bib_id  - [string] ID of bibliography component, defaults to "bibliography". | ||||||
|         +cell=lang |     tooltip - [string] optional text displayed as tooltip | ||||||
|         each icon in procon | 
 | ||||||
|             +cell.u-text-center #[+procon(icon ? "pro" : "con")] | mixin fn(id, bib_id, tooltip) | ||||||
|         +cell.u-text-right=size |     sup.u-padding-small(id="bib" + id data-tooltip=tooltip) | ||||||
|         +cell |         span.u-text-tag | ||||||
|             if license in licenses |             +a("#" + (bib_id || "bibliography")).u-hide-link #{id} | ||||||
|                 +a(licenses[license])=license |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| //- Table rows for annotation specs | //- Table rows for annotation specs | ||||||
|  | @ -383,14 +431,3 @@ mixin annotation-row(annots, style) | ||||||
|             else |             else | ||||||
|                 +cell=cell |                 +cell=cell | ||||||
|         block |         block | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //- Table of contents, to be used with +item mixins for links |  | ||||||
|     col - [string] width of column (see +grid-col) |  | ||||||
| 
 |  | ||||||
| mixin table-of-contents(col) |  | ||||||
|     +grid-col(col || "half") |  | ||||||
|         +infobox |  | ||||||
|             +label.o-block-small Table of contents |  | ||||||
|             +list("numbers").u-text-small.o-no-block |  | ||||||
|                 block |  | ||||||
|  |  | ||||||
|  | @ -1,19 +1,15 @@ | ||||||
| //- 💫 INCLUDES > TOP NAVIGATION | //- 💫 INCLUDES > TOP NAVIGATION | ||||||
| 
 | 
 | ||||||
| include _mixins |  | ||||||
| 
 |  | ||||||
| nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) | nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) | ||||||
|     a(href='/') #[+logo] |     a(href="/" aria-label=SITENAME) #[+logo] | ||||||
| 
 |  | ||||||
|     if SUBSECTION != "index" |  | ||||||
|         .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION |  | ||||||
| 
 | 
 | ||||||
|     ul.c-nav__menu |     ul.c-nav__menu | ||||||
|         - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION |         - var current_url = '/' + current.path[0] | ||||||
| 
 |         each url, item in NAVIGATION | ||||||
|         each url, item in NAV |             li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null) | ||||||
|             li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) |  | ||||||
|                 +a(url)=item |                 +a(url)=item | ||||||
| 
 | 
 | ||||||
|         li.c-nav__menu__item |         li.c-nav__menu__item.u-hidden-xs | ||||||
|             +a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)] |             +a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)] | ||||||
|  | 
 | ||||||
|  |     progress.c-progress.js-progress(value="0" max="1") | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| //- 💫 INCLUDES > NEWSLETTER | //- 💫 INCLUDES > NEWSLETTER | ||||||
| 
 | 
 | ||||||
| ul.o-block | ul.o-block-small | ||||||
|     li.u-text-label.u-color-subtle Stay in the loop! |     li.u-text-label.u-color-subtle Stay in the loop! | ||||||
|     li Receive updates about new releases, tutorials and more. |     li Receive updates about new releases, tutorials and more. | ||||||
| 
 | 
 | ||||||
|  | @ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c | ||||||
|     div(style="position: absolute; left: -5000px;" aria-hidden="true") |     div(style="position: absolute; left: -5000px;" aria-hidden="true") | ||||||
|         input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") |         input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") | ||||||
| 
 | 
 | ||||||
|     .o-grid-col.u-border.u-padding-small |     .o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small | ||||||
|         input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email") |         input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email") | ||||||
| 
 |         button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up | ||||||
|         button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up |  | ||||||
|  |  | ||||||
|  | @ -1,47 +1,56 @@ | ||||||
| //- 💫 INCLUDES > DOCS PAGE TEMPLATE | //- 💫 INCLUDES > DOCS PAGE TEMPLATE | ||||||
| 
 | 
 | ||||||
| - sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER | - sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER | ||||||
| 
 | 
 | ||||||
| include _sidebar | include _sidebar | ||||||
| 
 | 
 | ||||||
| main.o-main.o-main--sidebar.o-main--aside | main.o-main.o-main--sidebar.o-main--aside | ||||||
|     article.o-content |     article.o-content | ||||||
|         +grid.o-no-block |         +grid.o-no-block | ||||||
|             +grid-col(source ? "two-thirds" : "full") |             +h(1).u-heading--title=title.replace("'", "’") | ||||||
|                 +h(1)=title |                 if tag | ||||||
|                     if tag |                     +tag=tag | ||||||
|                         +tag=tag |                 if tag_new | ||||||
|  |                     +tag-new(tag_new) | ||||||
|  | 
 | ||||||
|  |                 if teaser | ||||||
|  |                     .u-heading__teaser.u-text-small.u-color-dark=teaser | ||||||
|  |                 else if IS_MODELS | ||||||
|  |                     .u-heading__teaser.u-text-small.u-color-dark | ||||||
|  |                         |  Available statistical models for | ||||||
|  |                         |  #[code=current.source] (#{LANGUAGES[current.source]}). | ||||||
| 
 | 
 | ||||||
|             if source |             if source | ||||||
|                 +grid-col("third").u-text-right |                 .o-block.u-text-right | ||||||
|                     .o-inline-list |                     +button(gh("spacy", source), false, "secondary", "small").u-nowrap | ||||||
|                         +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] |                         |  Source #[+icon("code", 14)] | ||||||
| 
 | 
 | ||||||
|  |         //-if ALPHA | ||||||
|  |         //-    +alpha-info | ||||||
| 
 | 
 | ||||||
|         if ALPHA |         if IS_MODELS | ||||||
|             +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") |             include _page_models | ||||||
|                 strong This page is part of the alpha documentation for spaCy v2.0. |         else | ||||||
|                 |  It does not reflect the state of the latest stable release. |             !=yield | ||||||
|                 |  Because v2.0 is still under development, the implementation |  | ||||||
|                 |  may differ from the intended state described here. See the |  | ||||||
|                 |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] |  | ||||||
|                 |  for details on how to install and test the new version. To |  | ||||||
|                 |  read the official docs for spaCy v1.x, |  | ||||||
|                 |  #[+a("https://spacy.io/docs") go here]. |  | ||||||
| 
 |  | ||||||
|         !=yield |  | ||||||
| 
 | 
 | ||||||
|     +grid.o-content.u-text |     +grid.o-content.u-text | ||||||
|         +grid-col("half") |         +grid-col("half") | ||||||
|             if next && public.docs[SUBSECTION]._data[next] |             if !IS_MODELS | ||||||
|                 - data = public.docs[SUBSECTION]._data[next] |  | ||||||
| 
 |  | ||||||
|                 .o-inline-list |                 .o-inline-list | ||||||
|                     span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title] |                     +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small") | ||||||
|  |                         |  #[span.o-icon Suggest edits] #[+icon("code", 14)] | ||||||
| 
 | 
 | ||||||
|         +grid-col("half").u-text-right |         +grid-col("half").u-text-right | ||||||
|             .o-inline-list |             if next && public[SECTION]._data[next] | ||||||
|                 +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] |                 - data = public[SECTION]._data[next] | ||||||
|  | 
 | ||||||
|  |                 +grid("vcenter") | ||||||
|  |                     +a(next).u-text-small.u-flex-full | ||||||
|  |                         h4.u-text-label.u-color-dark Read next | ||||||
|  |                         |  #{data.title} | ||||||
|  | 
 | ||||||
|  |                     +a(next).c-icon-button.c-icon-button--right(aria-hidden="true") | ||||||
|  |                         +icon("arrow-right", 24) | ||||||
| 
 | 
 | ||||||
|     +gitter("spaCy chat") |     +gitter("spaCy chat") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
77 website/_includes/_page_models.jade Normal file
|  | @ -0,0 +1,77 @@ | ||||||
|  | //- 💫 INCLUDES > MODELS PAGE TEMPLATE | ||||||
|  | 
 | ||||||
|  | for id in CURRENT_MODELS | ||||||
|  |     +section(id) | ||||||
|  |         +grid("vcenter").o-no-block(id=id) | ||||||
|  |             +grid-col("two-thirds") | ||||||
|  |                 +h(2) | ||||||
|  |                     +a("#" + id).u-permalink=id | ||||||
|  | 
 | ||||||
|  |             +grid-col("third").u-text-right | ||||||
|  |                 .u-color-subtle.u-text-tiny | ||||||
|  |                     +button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download") | ||||||
|  |                         |  Release details | ||||||
|  |                     .u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a] | ||||||
|  | 
 | ||||||
|  |         +aside-code("Installation", "bash", "$"). | ||||||
|  |             spacy download #{id} | ||||||
|  | 
 | ||||||
|  |         - var comps = getModelComponents(id) | ||||||
|  | 
 | ||||||
|  |         p(data-tpl=id data-tpl-key="description") | ||||||
|  | 
 | ||||||
|  |         div(data-tpl=id data-tpl-key="error" style="display: none") | ||||||
|  |             +infobox | ||||||
|  |                 |  Unable to load model details from GitHub. To find out more | ||||||
|  |                 |  about this model, see the overview of the | ||||||
|  |                 |  #[+a(gh("spacy-models") + "/releases") latest model releases]. | ||||||
|  | 
 | ||||||
|  |         +table(data-tpl=id data-tpl-key="table") | ||||||
|  |             +row | ||||||
|  |                 +cell #[+label Language] | ||||||
|  |                 +cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]} | ||||||
|  |             for comp, label in {"Type": comps.type, "Genre": comps.genre} | ||||||
|  |                 +row | ||||||
|  |                     +cell #[+label=label] | ||||||
|  |                     +cell #[+tag=comp] #{MODEL_META[comp]} | ||||||
|  |             +row | ||||||
|  |                 +cell #[+label Size] | ||||||
|  |                 +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]] | ||||||
|  | 
 | ||||||
|  |             each label in ["Pipeline", "Sources", "Author", "License"] | ||||||
|  |                 - var field = label.toLowerCase() | ||||||
|  |                 +row | ||||||
|  |                     +cell.u-nowrap | ||||||
|  |                         +label=label | ||||||
|  |                             if MODEL_META[field] | ||||||
|  |                                 |  #[+help(MODEL_META[field]).u-color-subtle] | ||||||
|  |                     +cell | ||||||
|  |                         span(data-tpl=id data-tpl-key=field) #[em n/a] | ||||||
|  | 
 | ||||||
|  |             +row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none") | ||||||
|  |                 +cell | ||||||
|  |                     +label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle] | ||||||
|  |                 +cell | ||||||
|  |                     .o-field.u-float-left | ||||||
|  |                         select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat") | ||||||
|  |                     .o-empty(data-tpl=id data-tpl-key="compat-versions")   | ||||||
|  | 
 | ||||||
|  |         section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none") | ||||||
|  |             +grid.o-no-block | ||||||
|  |                 +grid-col("third") | ||||||
|  |                     +h(4) Accuracy | ||||||
|  |                     +table.o-block-small | ||||||
|  |                         for label, field in MODEL_ACCURACY | ||||||
|  |                             +row(style="display: none") | ||||||
|  |                                 +cell.u-nowrap | ||||||
|  |                                     +label=label | ||||||
|  |                                         if MODEL_META[field] | ||||||
|  |                                             |  #[+help(MODEL_META[field]).u-color-subtle] | ||||||
|  |                                 +cell.u-text-right(data-tpl=id data-tpl-key=field) | ||||||
|  |                                     |  n/a | ||||||
|  | 
 | ||||||
|  |                 +grid-col("two-thirds") | ||||||
|  |                     +h(4) Comparison | ||||||
|  |                     +chart(id).u-padding-small | ||||||
|  | 
 | ||||||
|  |         p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes") | ||||||
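The data-tpl / data-tpl-key attributes in the template above are placeholders that the client-side script (the ModelLoader instantiated in the scripts include below) presumably fills in once model metadata is available. A minimal, browser-only sketch of that pattern, as an illustration rather than the site's actual implementation:

    // Browser-only sketch: fill a placeholder rendered by the template above.
    function fillTemplate(modelId, key, value) {
        var el = document.querySelector(
            '[data-tpl="' + modelId + '"][data-tpl-key="' + key + '"]');
        if (el) el.textContent = value;
    }

    // Example values only; real data would come from the release metadata.
    fillTemplate('en_core_web_sm', 'version', '2.0.0');
    fillTemplate('en_core_web_sm', 'license', 'CC BY-SA');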
|  | @ -1,27 +1,46 @@ | ||||||
| //- 💫 INCLUDES > SCRIPTS | //- 💫 INCLUDES > SCRIPTS | ||||||
| 
 | 
 | ||||||
| script(src="/assets/js/main.js?v#{V_JS}") | if quickstart | ||||||
| script(src="/assets/js/prism.js") |         script(src="/assets/js/quickstart.min.js") | ||||||
| 
 | 
 | ||||||
| if SECTION == "docs" | if IS_PAGE | ||||||
|     if quickstart |     script(src="/assets/js/in-view.min.js") | ||||||
|         script(src="/assets/js/quickstart.js") |  | ||||||
|         script var qs = new Quickstart("#qs") |  | ||||||
| 
 | 
 | ||||||
|     script. | if HAS_MODELS | ||||||
|         ((window.gitter = {}).chat = {}).options = { |     script(src="/assets/js/chart.min.js") | ||||||
|             useStyles: false, |  | ||||||
|             activationElement: '.js-gitter-button', |  | ||||||
|             targetElement: '.js-gitter', |  | ||||||
|             room: '!{SOCIAL.gitter}' |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) |  | ||||||
| 
 | 
 | ||||||
| if environment == "deploy" | if environment == "deploy" | ||||||
|     script |     script(async src="https://www.google-analytics.com/analytics.js") | ||||||
|  | 
 | ||||||
|  | script(src="/assets/js/prism.min.js") | ||||||
|  | script(src="/assets/js/main.js?v#{V_JS}") | ||||||
|  | 
 | ||||||
|  | script | ||||||
|  |     | new ProgressBar('.js-progress'); | ||||||
|  | 
 | ||||||
|  |     if changelog | ||||||
|  |         | new Changelog('!{SOCIAL.github}', 'spacy'); | ||||||
|  | 
 | ||||||
|  |     if quickstart | ||||||
|  |         | new Quickstart("#qs"); | ||||||
|  | 
 | ||||||
|  |     if IS_PAGE | ||||||
|  |         | new SectionHighlighter('data-section', 'data-nav'); | ||||||
|  |         | new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed'); | ||||||
|  |         | ((window.gitter = {}).chat = {}).options = { | ||||||
|  |         |     useStyles: false, | ||||||
|  |         |     activationElement: '.js-gitter-button', | ||||||
|  |         |     targetElement: '.js-gitter', | ||||||
|  |         |     room: '!{SOCIAL.gitter}' | ||||||
|  |         | }; | ||||||
|  | 
 | ||||||
|  |     if HAS_MODELS | ||||||
|  |         | new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)}); | ||||||
|  | 
 | ||||||
|  |     if environment == "deploy" | ||||||
|         | window.ga=window.ga||function(){ |         | window.ga=window.ga||function(){ | ||||||
|         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; |         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ||||||
|         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); |         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); | ||||||
| 
 | 
 | ||||||
|     script(async src="https://www.google-analytics.com/analytics.js") | if IS_PAGE | ||||||
|  |     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) | ||||||
|  |  | ||||||
|  | @ -1,13 +1,23 @@ | ||||||
| //- 💫 INCLUDES > SIDEBAR | //- 💫 INCLUDES > SIDEBAR | ||||||
| 
 | 
 | ||||||
| include _mixins |  | ||||||
| 
 |  | ||||||
| menu.c-sidebar.js-sidebar.u-text | menu.c-sidebar.js-sidebar.u-text | ||||||
|     if sidebar_content |     if sidebar_content | ||||||
|         each items, menu in sidebar_content |         each items, sectiontitle in sidebar_content | ||||||
|             ul.c-sidebar__section.o-block |             ul.c-sidebar__section.o-block-small | ||||||
|                 li.u-text-label.u-color-subtle=menu |                 li.u-text-label.u-color-dark=sectiontitle | ||||||
| 
 | 
 | ||||||
|                 each url, item in items |                 each url, item in items | ||||||
|                     li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null) |                     - var is_current = CURRENT == url || (CURRENT == "index" && url == "./") | ||||||
|                         +a(url)=item |                     li.c-sidebar__item | ||||||
|  |                         +a(url)(class=is_current ? "is-active" : null)=item | ||||||
|  | 
 | ||||||
|  |                         if is_current | ||||||
|  |                             if IS_MODELS && CURRENT_MODELS.length | ||||||
|  |                                 - menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id }))) | ||||||
|  |                             if menu | ||||||
|  |                                 ul.c-sidebar__crumb.u-hidden-sm | ||||||
|  |                                     - var counter = 0 | ||||||
|  |                                     for id, title in menu | ||||||
|  |                                         - counter++ | ||||||
|  |                                         li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null) | ||||||
|  |                                             +a("#section-" + id)=title | ||||||
|  |  | ||||||
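The crumb menu above is built from CURRENT_MODELS by spreading mapped objects into Object.assign; a small sketch with made-up model IDs:

    // Sketch of the menu-building one-liner in the sidebar above.
    var CURRENT_MODELS = ['en_core_web_sm', 'en_core_web_md'];  // example IDs
    var menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({[id]: id})));
    console.log(menu);
    // { en_core_web_sm: 'en_core_web_sm', en_core_web_md: 'en_core_web_md' }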
							
								
								
									
157 website/_includes/_svg.jade Normal file
File diff suppressed because one or more lines are too long
|  | @ -2,11 +2,16 @@ | ||||||
| 
 | 
 | ||||||
| include _includes/_mixins | include _includes/_mixins | ||||||
| 
 | 
 | ||||||
|  | - title = IS_MODELS ? LANGUAGES[current.source] || title : title | ||||||
|  | - social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME | ||||||
|  | - social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg" | ||||||
|  | 
 | ||||||
| doctype html | doctype html | ||||||
| html(lang="en") | html(lang="en") | ||||||
|     title |     title | ||||||
|         if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" |         if SECTION == "api" || SECTION == "usage" || SECTION == "models" | ||||||
|             | #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation |             - var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1) | ||||||
|  |             | #{title} | #{SITENAME} #{title_section} Documentation | ||||||
| 
 | 
 | ||||||
|         else if SECTION != "index" |         else if SECTION != "index" | ||||||
|             | #{title} | #{SITENAME} |             | #{title} | #{SITENAME} | ||||||
|  | @ -22,32 +27,30 @@ html(lang="en") | ||||||
|     meta(property="og:type" content="website") |     meta(property="og:type" content="website") | ||||||
|     meta(property="og:site_name" content=sitename) |     meta(property="og:site_name" content=sitename) | ||||||
|     meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") |     meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") | ||||||
|     meta(property="og:title" content="#{title} - spaCy") |     meta(property="og:title" content=social_title) | ||||||
|     meta(property="og:description" content=description) |     meta(property="og:description" content=description) | ||||||
|     meta(property="og:image" content=getSocialImg()) |     meta(property="og:image" content=social_img) | ||||||
| 
 | 
 | ||||||
|     meta(name="twitter:card" content="summary_large_image") |     meta(name="twitter:card" content="summary_large_image") | ||||||
|     meta(name="twitter:site" content="@" + SOCIAL.twitter) |     meta(name="twitter:site" content="@" + SOCIAL.twitter) | ||||||
|     meta(name="twitter:title" content="#{title} - spaCy") |     meta(name="twitter:title" content=social_title) | ||||||
|     meta(name="twitter:description" content=description) |     meta(name="twitter:description" content=description) | ||||||
|     meta(name="twitter:image" content=getSocialImg()) |     meta(name="twitter:image" content=social_img) | ||||||
| 
 | 
 | ||||||
|     link(rel="shortcut icon" href="/assets/img/favicon.ico") |     link(rel="shortcut icon" href="/assets/img/favicon.ico") | ||||||
|     link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") |     link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") | ||||||
| 
 | 
 | ||||||
|     if ALPHA && SECTION == "docs" |     if SECTION == "api" | ||||||
|         link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") |         link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") | ||||||
| 
 | 
 | ||||||
|     else if SUBSECTION == "usage" |  | ||||||
|         link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") |  | ||||||
| 
 |  | ||||||
|     else |     else | ||||||
|         link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") |         link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") | ||||||
| 
 | 
 | ||||||
|     body |     body | ||||||
|  |         include _includes/_svg | ||||||
|         include _includes/_navigation |         include _includes/_navigation | ||||||
| 
 | 
 | ||||||
|         if SECTION == "docs" |         if !landing | ||||||
|             include _includes/_page-docs |             include _includes/_page-docs | ||||||
| 
 | 
 | ||||||
|         else |         else | ||||||
|  |  | ||||||
							
								
								
									
43 website/api/_annotation/_biluo.jade Normal file
|  | @ -0,0 +1,43 @@ | ||||||
|  | //- 💫 DOCS > API > ANNOTATION > BILUO | ||||||
|  | 
 | ||||||
|  | +table([ "Tag", "Description" ]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme B] EGIN] | ||||||
|  |         +cell The first token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme I] N] | ||||||
|  |         +cell An inner token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme L] AST] | ||||||
|  |         +cell The final token of a multi-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme U] NIT] | ||||||
|  |         +cell A single-token entity. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code #[span.u-color-theme O] UT] | ||||||
|  |         +cell A non-entity token. | ||||||
|  | 
 | ||||||
|  | +aside("Why BILUO, not IOB?") | ||||||
|  |     |  There are several coding schemes for encoding entity annotations as | ||||||
|  |     |  token tags.  These coding schemes are equally expressive, but not | ||||||
|  |     |  necessarily equally learnable. | ||||||
|  |     |  #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] | ||||||
|  |     |  showed that the minimal #[strong Begin], #[strong In], #[strong Out] | ||||||
|  |     |  scheme was more difficult to learn than the #[strong BILUO] scheme that | ||||||
|  |     |  we use, which explicitly marks boundary tokens. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy translates the character offsets into this scheme, in order to | ||||||
|  |     |  decide the cost of each action given the current state of the entity | ||||||
|  |     |  recogniser. The costs are then used to calculate the gradient of the | ||||||
|  |     |  loss, to train the model. The exact algorithm is a pastiche of | ||||||
|  |     |  well-known methods, and is not currently described in any single | ||||||
|  |     |  publication. The model is a greedy transition-based parser guided by a | ||||||
|  |     |  linear model whose weights are learned using the averaged perceptron | ||||||
|  |     |  loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | ||||||
|  |     |  imitation learning strategy. The transition system is equivalent to the | ||||||
|  |     |  BILUO tagging scheme. | ||||||
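To make the scheme concrete, here is a minimal Python sketch of converting character offsets to BILUO tags. It assumes spaCy v2.x, where a biluo_tags_from_offsets helper is exposed in spacy.gold; the example text and entity offsets are invented for illustration.

    # Minimal sketch (assumes spaCy v2.x and the spacy.gold.biluo_tags_from_offsets helper).
    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp(u"I like New York and Berlin.")
    # Invented character-offset annotations: (start_char, end_char, label)
    entities = [(7, 15, 'GPE'), (20, 26, 'GPE')]
    tags = biluo_tags_from_offsets(doc, entities)
    print(tags)
    # Expected: ['O', 'O', 'B-GPE', 'L-GPE', 'O', 'U-GPE', 'O']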
							
								
								
									
115  website/api/_architecture/_cython.jade  Normal file
							|  | @ -0,0 +1,115 @@ | ||||||
|  | //- 💫 DOCS > API > ARCHITECTURE > CYTHON | ||||||
|  | 
 | ||||||
|  | +aside("What's Cython?") | ||||||
|  |     |  #[+a("http://cython.org/") Cython] is a language for writing | ||||||
|  |     |  C extensions for Python. Most Python code is also valid Cython, but | ||||||
|  |     |  you can add type declarations to get efficient memory-managed code | ||||||
|  |     |  just like C or C++. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy's core data structures are implemented as | ||||||
|  |     |  #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is | ||||||
|  |     |  managed through the #[+a(gh("cymem")) #[code cymem]] | ||||||
|  |     |  #[code cymem.Pool] class, which allows you | ||||||
|  |     |  to allocate memory which will be freed when the #[code Pool] object | ||||||
|  |     |  is garbage collected. This means you usually don't have to worry | ||||||
|  |     |  about freeing memory. You just have to decide which Python object | ||||||
|  |     |  owns the memory, and make it own the #[code Pool]. When that object | ||||||
|  |     |  goes out of scope, the memory will be freed. You do have to take | ||||||
|  |     |  care that no pointers outlive the object that owns them — but this | ||||||
|  |     |  is generally quite easy. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All Cython modules should have the #[code # cython: infer_types=True] | ||||||
|  |     |  compiler directive at the top of the file. This makes the code much | ||||||
|  |     |  cleaner, as it avoids the need for many type declarations. If | ||||||
|  |     |  possible, you should prefer to declare your functions #[code nogil], | ||||||
|  |     |  even if you don't especially care about multi-threading. The reason | ||||||
|  |     |  is that #[code nogil] functions help the Cython compiler reason about | ||||||
|  |     |  your code quite a lot — you're telling the compiler that no Python | ||||||
|  |     |  dynamics are possible. This lets many errors be raised, and ensures | ||||||
|  |     |  your function will run at C speed. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Cython gives you many choices of sequences: you could have a Python | ||||||
|  |     |  list, a numpy array, a memory view, a C++ vector, or a pointer. | ||||||
|  |     |  Pointers are preferred, because they are fastest, have the most | ||||||
|  |     |  explicit semantics, and let the compiler check your code more | ||||||
|  |     |  strictly. C++ vectors are also great — but you should only use them | ||||||
|  |     |  internally in functions. It's less friendly to accept a vector as an | ||||||
|  |     |  argument, because that asks the user to do much more work. Here's | ||||||
|  |     |  how to get a pointer from a numpy array, memory view or vector: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: | ||||||
|  |         pointer1 = <int*>numpy_array.data | ||||||
|  |         pointer2 = cpp_vector.data() | ||||||
|  |         pointer3 = &memory_view[0] | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Both C arrays and C++ vectors reassure the compiler that no Python | ||||||
|  |     |  operations are possible on your variable. This is a big advantage: | ||||||
|  |     |  it lets the Cython compiler raise many more errors for you. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  When getting a pointer from a numpy array or memoryview, take care | ||||||
|  |     |  that the data is actually stored in C-contiguous order — otherwise | ||||||
|  |     |  you'll get a pointer to nonsense. The type-declarations in the code | ||||||
|  |     |  above should generate runtime errors if buffers with incorrect | ||||||
|  |     |  memory layouts are passed in. To iterate over the array, the | ||||||
|  |     |  following style is preferred: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     cdef int c_total(const int* int_array, int length) nogil: | ||||||
|  |         total = 0 | ||||||
|  |         for item in int_array[:length]: | ||||||
|  |             total += item | ||||||
|  |         return total | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  If this is confusing, consider that the compiler couldn't deal with | ||||||
|  |     |  #[code for item in int_array:] — there's no length attached to a raw | ||||||
|  |     |  pointer, so how could we figure out where to stop? The length is | ||||||
|  |     |  provided in the slice notation as a solution to this. Note that we | ||||||
|  |     |  don't have to declare the type of #[code item] in the code above — | ||||||
|  |     |  the compiler can easily infer it. This gives us tidy code that looks | ||||||
|  |     |  quite like Python, but is exactly as fast as C — because we've made | ||||||
|  |     |  sure the compilation to C is trivial. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Your functions cannot be declared #[code nogil] if they need to | ||||||
|  |     |  create Python objects or call Python functions. This is perfectly | ||||||
|  |     |  okay — you shouldn't torture your code just to get #[code nogil] | ||||||
|  |     |  functions. However, if your function isn't #[code nogil], you should | ||||||
|  |     |  compile your module with #[code cython -a --cplus my_module.pyx] and | ||||||
|  |     |  open the resulting #[code my_module.html] file in a browser. This | ||||||
|  |     |  will let you see how Cython is compiling your code. Calls into the | ||||||
|  |     |  Python run-time will be in bright yellow. This lets you easily see | ||||||
|  |     |  whether Cython is able to correctly type your code, or whether there | ||||||
|  |     |  are unexpected problems. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Working in Cython is very rewarding once you're over the initial | ||||||
|  |     |  learning curve. As with C and C++, the first way you write something | ||||||
|  |     |  in Cython will often be the performance-optimal approach. In | ||||||
|  |     |  contrast, Python optimisation generally requires a lot of | ||||||
|  |     |  experimentation. Is it faster to have an #[code if item in my_dict] | ||||||
|  |     |  check, or to use #[code .get()]? What about | ||||||
|  |     |  #[code try]/#[code except]? Does this numpy operation create a copy? | ||||||
|  |     |  There's no way to guess the answers to these questions, and you'll | ||||||
|  |     |  usually be dissatisfied with your results — so there's no way to | ||||||
|  |     |  know when to stop this process. In the worst case, you'll make a | ||||||
|  |     |  mess that invites the next reader to try their luck too. This is | ||||||
|  |     |  like one of those | ||||||
|  |     |  #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps], | ||||||
|  |     |  where the rescuers keep passing out from low oxygen, causing | ||||||
|  |     |  another rescuer to follow — only to succumb themselves. In short, | ||||||
|  |     |  just say no to optimizing your Python. If it's not fast enough the | ||||||
|  |     |  first time, just switch to Cython. | ||||||
|  | 
 | ||||||
|  | +infobox("Resources") | ||||||
|  |     +list.o-no-block | ||||||
|  |         +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org) | ||||||
|  |         +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai) | ||||||
|  |         +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai) | ||||||
							
								
								
									
141  website/api/_architecture/_nn-model.jade  Normal file
							|  | @ -0,0 +1,141 @@ | ||||||
|  | //- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The parsing model is a blend of recent results. The two recent | ||||||
|  |     |  inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at | ||||||
|  |     |  Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of | ||||||
|  |     |  the parser is still based on the work of Joakim Nivre#[+fn(2)], who | ||||||
|  |     |  introduced the transition-based framework#[+fn(3)], the arc-eager | ||||||
|  |     |  transition system, and the imitation learning objective. The model is | ||||||
|  |     |  implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning | ||||||
|  |     |  library. We first predict context-sensitive vectors for each word in the | ||||||
|  |     |  input: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     (embed_lower | embed_prefix | embed_suffix | embed_shape) | ||||||
|  |         >> Maxout(token_width) | ||||||
|  |         >> convolution ** 4 | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This convolutional layer is shared between the tagger, parser and NER, | ||||||
|  |     |  and will also be shared by the future neural lemmatizer. Because the | ||||||
|  |     |  parser shares these layers with the tagger, the parser does not require | ||||||
|  |     |  tag features. I got this trick from David Weiss's "Stack Combination" | ||||||
|  |     |  paper#[+fn(4)]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To boost the representation, the tagger actually predicts a "super tag" | ||||||
|  |     |  with POS, morphology and dependency label#[+fn(5)]. The tagger predicts | ||||||
|  |     |  these supertags by adding a softmax layer onto the convolutional layer – | ||||||
|  |     |  so, we're teaching the convolutional layer to give us a representation | ||||||
|  |     |  that's one affine transform from this informative lexical information. | ||||||
|  |     |  This is obviously good for the parser (which backprops to the | ||||||
|  |     |  convolutions too). The parser model makes a state vector by concatenating | ||||||
|  |     |  the vector representations for its context tokens.  The current context | ||||||
|  |     |  tokens: | ||||||
|  | 
 | ||||||
|  | +table | ||||||
|  |     +row | ||||||
|  |         +cell #[code S0], #[code S1], #[code S2] | ||||||
|  |         +cell Top three words on the stack. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code B0], #[code B1] | ||||||
|  |         +cell First two words of the buffer. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell.u-nowrap | ||||||
|  |             |  #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1], | ||||||
|  |             |  #[code B1L1]#[br] | ||||||
|  |             |  #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2], | ||||||
|  |             |  #[code B1L2] | ||||||
|  |         +cell | ||||||
|  |             |  Leftmost and second leftmost children of #[code S0], #[code S1], | ||||||
|  |             |  #[code S2], #[code B0] and #[code B1]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell.u-nowrap | ||||||
|  |             |  #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1], | ||||||
|  |             |  #[code B1R1]#[br] | ||||||
|  |             |  #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2], | ||||||
|  |             |  #[code B1R2] | ||||||
|  |         +cell | ||||||
|  |             |  Rightmost and second rightmost children of #[code S0], #[code S1], | ||||||
|  |             |  #[code S2], #[code B0] and #[code B1]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This makes the state vector quite long: #[code 13*T], where #[code T] is | ||||||
|  |     |  the token vector width (128 is working well). Fortunately, there's a way | ||||||
|  |     |  to structure the computation to save some expense (and make it more | ||||||
|  |     |  GPU-friendly). | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The parser typically visits #[code 2*N] states for a sentence of length | ||||||
|  |     |  #[code N] (although it may visit more, if it back-tracks with a | ||||||
|  |     |  non-monotonic transition#[+fn(6)]). A naive implementation would require | ||||||
|  |     |  #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of | ||||||
|  |     |  size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)] | ||||||
|  |     |  multiplication, to pre-compute the hidden weights for each positional | ||||||
|  |     |  feature with respect to the words in the batch. (Note that our token | ||||||
|  |     |  vectors come from the CNN — so we can't play this trick over the | ||||||
|  |     |  vocabulary. That's how Stanford's NN parser#[+fn(7)] works — and why its | ||||||
|  |     |  model is so big.) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This pre-computation strategy allows a nice compromise between | ||||||
|  |     |  GPU-friendliness and implementation simplicity. The CNN and the wide | ||||||
|  |     |  lower layer are computed on the GPU, and then the precomputed hidden | ||||||
|  |     |  weights are moved to the CPU, before we start the transition-based | ||||||
|  |     |  parsing process. This makes a lot of things much easier. We don't have to | ||||||
|  |     |  worry about variable-length batch sizes, and we don't have to implement | ||||||
|  |     |  the dynamic oracle in CUDA to train. | ||||||
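To make the pre-computation trick concrete, here is a rough numpy sketch. The names and sizes are illustrative assumptions rather than spaCy's actual code: one (B*N, T) @ (T, 13*H) product is done up front, and each parser state then only sums 13 precomputed H-sized blocks instead of multiplying its concatenated 13*T feature vector by a (13*T, H) weight matrix.

    # Illustrative numpy sketch of the pre-computation trick (not spaCy's actual code).
    import numpy as np

    B, N, T, H, F = 4, 20, 128, 64, 13        # batch, sentence length, token width, hidden width, feature slots
    tokens = np.random.randn(B * N, T)        # context-sensitive token vectors from the CNN
    W = np.random.randn(T, F * H)             # hidden weights: one H-sized block per feature slot

    # One big matrix multiplication up front, instead of one per parser state.
    precomputed = tokens.dot(W)               # shape (B*N, F*H)

    def state_hidden(feature_token_ids):
        # Hidden-layer input for one state: sum the precomputed block for each
        # filled feature slot (-1 marks an empty slot, e.g. an absent child).
        total = np.zeros(H)
        for slot, tok in enumerate(feature_token_ids):
            if tok >= 0:
                total += precomputed[tok, slot * H:(slot + 1) * H]
        return total

    state = [5, 6, -1, 7, 8] + [-1] * 8       # e.g. S0, S1 filled, S2 empty, B0, B1 filled
    print(state_hidden(state).shape)          # (64,)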
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Currently the parser's loss function is multilabel log loss#[+fn(6)], as | ||||||
|  |     |  the dynamic oracle allows multiple states to be 0 cost. This is defined | ||||||
|  |     |  as follows, where #[code Z] is the sum of the exponentiated scores over | ||||||
|  |     |  all classes and #[code gZ] is the sum of the exponentiated scores over | ||||||
|  |     |  the gold (zero-cost) classes: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     (exp(score) / Z) - (exp(score) / gZ) | ||||||
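As a worked example of that expression, here is a small numpy sketch with invented scores. It assumes the usual implementation detail that the second term is applied only to the gold (zero-cost) classes:

    # Invented numbers: gradient of the multilabel log loss with respect to the scores.
    import numpy as np

    scores = np.array([2.0, 1.0, 0.5, -1.0])   # one score per possible transition
    is_gold = np.array([1.0, 1.0, 0.0, 0.0])   # classes the dynamic oracle marks as zero-cost

    exp_scores = np.exp(scores)
    Z = exp_scores.sum()                        # sum of exp(score) over all classes
    gZ = (exp_scores * is_gold).sum()           # sum of exp(score) over the gold classes

    d_scores = exp_scores / Z - is_gold * exp_scores / gZ
    print(d_scores)        # negative for gold classes, positive for the rest
    print(d_scores.sum())  # ~0.0: the gradient sums to zero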
|  | 
 | ||||||
|  | +bibliography | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations] | ||||||
|  |         br | ||||||
|  |         |  Eliyahu Kiperwasser, Yoav Goldberg. (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing] | ||||||
|  |         br | ||||||
|  |         |  Yoav Goldberg, Joakim Nivre (2012) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python] | ||||||
|  |         br | ||||||
|  |         |  Matthew Honnibal (2013) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax] | ||||||
|  |         br | ||||||
|  |         |  Yuan Zhang, David Weiss (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers] | ||||||
|  |         br | ||||||
|  |         |  Anders Søgaard, Yoav Goldberg (2016) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing] | ||||||
|  |         br | ||||||
|  |         |  Matthew Honnibal, Mark Johnson (2015) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks] | ||||||
|  |         br | ||||||
|  |         |  Danqi Chen, Christopher D. Manning (2014) | ||||||
|  | 
 | ||||||
|  |     +item | ||||||
|  |         |  #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques] | ||||||
|  |         br | ||||||
|  |         |  Stefan Riezler et al. (2002) | ||||||
|  | @ -1,29 +1,32 @@ | ||||||
| { | { | ||||||
|     "sidebar": { |     "sidebar": { | ||||||
|         "Introduction": { |         "Overview": { | ||||||
|             "Facts & Figures": "./", |             "Architecture": "./", | ||||||
|             "Languages": "language-models", |             "Annotation Specs": "annotation", | ||||||
|             "Annotation Specs": "annotation" |             "Functions": "top-level" | ||||||
|         }, |         }, | ||||||
|         "Top-level": { |         "Containers": { | ||||||
|             "spacy": "spacy", |  | ||||||
|             "displacy": "displacy", |  | ||||||
|             "Utility Functions": "util", |  | ||||||
|             "Command line": "cli" |  | ||||||
|         }, |  | ||||||
|         "Classes": { |  | ||||||
|             "Doc": "doc", |             "Doc": "doc", | ||||||
|             "Token": "token", |             "Token": "token", | ||||||
|             "Span": "span", |             "Span": "span", | ||||||
|  |             "Lexeme": "lexeme" | ||||||
|  |         }, | ||||||
|  | 
 | ||||||
|  |         "Pipeline": { | ||||||
|             "Language": "language", |             "Language": "language", | ||||||
|             "Tokenizer": "tokenizer", |             "Pipe": "pipe", | ||||||
|             "Tensorizer": "tensorizer", |             "Tensorizer": "tensorizer", | ||||||
|             "Tagger": "tagger", |             "Tagger": "tagger", | ||||||
|             "DependencyParser": "dependencyparser", |             "DependencyParser": "dependencyparser", | ||||||
|             "EntityRecognizer": "entityrecognizer", |             "EntityRecognizer": "entityrecognizer", | ||||||
|             "TextCategorizer": "textcategorizer", |             "TextCategorizer": "textcategorizer", | ||||||
|  |             "Tokenizer": "tokenizer", | ||||||
|  |             "Lemmatizer": "lemmatizer", | ||||||
|             "Matcher": "matcher", |             "Matcher": "matcher", | ||||||
|             "Lexeme": "lexeme", |             "PhraseMatcher": "phrasematcher" | ||||||
|  |         }, | ||||||
|  | 
 | ||||||
|  |         "Other": { | ||||||
|             "Vocab": "vocab", |             "Vocab": "vocab", | ||||||
|             "StringStore": "stringstore", |             "StringStore": "stringstore", | ||||||
|             "Vectors": "vectors", |             "Vectors": "vectors", | ||||||
|  | @ -34,52 +37,37 @@ | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "index": { |     "index": { | ||||||
|         "title": "Facts & Figures", |         "title": "Architecture", | ||||||
|         "next": "language-models" |         "next": "annotation", | ||||||
|  |         "menu": { | ||||||
|  |             "Basics": "basics", | ||||||
|  |             "Neural Network Model": "nn-model", | ||||||
|  |             "Cython Conventions": "cython" | ||||||
|  |         } | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "language-models": { |     "top-level": { | ||||||
|         "title": "Languages", |         "title": "Top-level Functions", | ||||||
|         "next": "philosophy" |         "menu": { | ||||||
|     }, |             "spacy": "spacy", | ||||||
| 
 |             "displacy": "displacy", | ||||||
|     "philosophy": { |             "Utility Functions": "util", | ||||||
|         "title": "Philosophy" |             "Compatibility": "compat", | ||||||
|     }, |             "Command Line": "cli" | ||||||
| 
 |         } | ||||||
|     "spacy": { |  | ||||||
|         "title": "spaCy top-level functions", |  | ||||||
|         "source": "spacy/__init__.py", |  | ||||||
|         "next": "displacy" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "displacy": { |  | ||||||
|         "title": "displaCy", |  | ||||||
|         "tag": "module", |  | ||||||
|         "source": "spacy/displacy", |  | ||||||
|         "next": "util" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "util": { |  | ||||||
|         "title": "Utility Functions", |  | ||||||
|         "source": "spacy/util.py", |  | ||||||
|         "next": "cli" |  | ||||||
|     }, |  | ||||||
| 
 |  | ||||||
|     "cli": { |  | ||||||
|         "title": "Command Line Interface", |  | ||||||
|         "source": "spacy/cli" |  | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "language": { |     "language": { | ||||||
|         "title": "Language", |         "title": "Language", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "teaser": "A text-processing pipeline.", | ||||||
|         "source": "spacy/language.py" |         "source": "spacy/language.py" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "doc": { |     "doc": { | ||||||
|         "title": "Doc", |         "title": "Doc", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "teaser": "A container for accessing linguistic annotations.", | ||||||
|         "source": "spacy/tokens/doc.pyx" |         "source": "spacy/tokens/doc.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | @ -103,6 +91,7 @@ | ||||||
| 
 | 
 | ||||||
|     "vocab": { |     "vocab": { | ||||||
|         "title": "Vocab", |         "title": "Vocab", | ||||||
|  |         "teaser": "A storage class for vocabulary and other data shared across a language.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/vocab.pyx" |         "source": "spacy/vocab.pyx" | ||||||
|     }, |     }, | ||||||
|  | @ -115,10 +104,27 @@ | ||||||
| 
 | 
 | ||||||
|     "matcher": { |     "matcher": { | ||||||
|         "title": "Matcher", |         "title": "Matcher", | ||||||
|  |         "teaser": "Match sequences of tokens, based on pattern rules.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/matcher.pyx" |         "source": "spacy/matcher.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     "phrasematcher": { | ||||||
|  |         "title": "PhraseMatcher", | ||||||
|  |         "teaser": "Match sequences of tokens, based on documents.", | ||||||
|  |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|  |         "source": "spacy/matcher.pyx" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|  |     "pipe": { | ||||||
|  |         "title": "Pipe", | ||||||
|  |         "teaser": "Abstract base class defining the API for pipeline components.", | ||||||
|  |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|  |         "source": "spacy/pipeline.pyx" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     "dependenyparser": { |     "dependenyparser": { | ||||||
|         "title": "DependencyParser", |         "title": "DependencyParser", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  | @ -127,18 +133,22 @@ | ||||||
| 
 | 
 | ||||||
|     "entityrecognizer": { |     "entityrecognizer": { | ||||||
|         "title": "EntityRecognizer", |         "title": "EntityRecognizer", | ||||||
|  |         "teaser": "Annotate named entities on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "textcategorizer": { |     "textcategorizer": { | ||||||
|         "title": "TextCategorizer", |         "title": "TextCategorizer", | ||||||
|  |         "teaser": "Add text categorization models to spaCy pipelines.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "dependencyparser": { |     "dependencyparser": { | ||||||
|         "title": "DependencyParser", |         "title": "DependencyParser", | ||||||
|  |         "teaser": "Annotate syntactic dependencies on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
|  | @ -149,15 +159,23 @@ | ||||||
|         "source": "spacy/tokenizer.pyx" |         "source": "spacy/tokenizer.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |     "lemmatizer": { | ||||||
|  |         "title": "Lemmatizer", | ||||||
|  |         "tag": "class" | ||||||
|  |     }, | ||||||
|  | 
 | ||||||
|     "tagger": { |     "tagger": { | ||||||
|         "title": "Tagger", |         "title": "Tagger", | ||||||
|  |         "teaser": "Annotate part-of-speech tags on documents.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "tensorizer": { |     "tensorizer": { | ||||||
|         "title": "Tensorizer", |         "title": "Tensorizer", | ||||||
|  |         "teaser": "Add a tensor with position-sensitive meaning representations to a document.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/pipeline.pyx" |         "source": "spacy/pipeline.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  | @ -169,23 +187,38 @@ | ||||||
| 
 | 
 | ||||||
|     "goldcorpus": { |     "goldcorpus": { | ||||||
|         "title": "GoldCorpus", |         "title": "GoldCorpus", | ||||||
|  |         "teaser": "An annotated corpus, using the JSON file format.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/gold.pyx" |         "source": "spacy/gold.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "binder": { |     "binder": { | ||||||
|         "title": "Binder", |         "title": "Binder", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/tokens/binder.pyx" |         "source": "spacy/tokens/binder.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "vectors": { |     "vectors": { | ||||||
|         "title": "Vectors", |         "title": "Vectors", | ||||||
|  |         "teaser": "Store, save and load word vectors.", | ||||||
|         "tag": "class", |         "tag": "class", | ||||||
|  |         "tag_new": 2, | ||||||
|         "source": "spacy/vectors.pyx" |         "source": "spacy/vectors.pyx" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "annotation": { |     "annotation": { | ||||||
|         "title": "Annotation Specifications" |         "title": "Annotation Specifications", | ||||||
|  |         "teaser": "Schemes used for labels, tags and training data.", | ||||||
|  |         "menu": { | ||||||
|  |             "Tokenization": "tokenization", | ||||||
|  |             "Sentence Boundaries": "sbd", | ||||||
|  |             "POS Tagging": "pos-tagging", | ||||||
|  |             "Lemmatization": "lemmatization", | ||||||
|  |             "Dependencies": "dependency-parsing", | ||||||
|  |             "Named Entities": "named-entities", | ||||||
|  |             "Training Data": "training" | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | @ -1,26 +1,17 @@ | ||||||
| //- 💫 DOCS > USAGE > COMMAND LINE INTERFACE | //- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  As of v1.7.0, spaCy comes with new command line helpers to download and |     |  As of v1.7.0, spaCy comes with new command line helpers to download and | ||||||
|     |  link models and show useful debugging information. For a list of available |     |  link models and show useful debugging information. For a list of available | ||||||
|     |  commands, type #[code spacy --help]. |     |  commands, type #[code spacy --help]. | ||||||
| 
 | 
 | ||||||
| +infobox("⚠️ Deprecation note") | +h(3, "download") Download | ||||||
|     |  As of spaCy 2.0, the #[code model] command to initialise a model data |  | ||||||
|     |  directory is deprecated. The command was only necessary because previous |  | ||||||
|     |  versions of spaCy expected a model directory to already be set up. This |  | ||||||
|     |  has since been changed, so you can use the #[+api("cli#train") #[code train]] |  | ||||||
|     |  command straight away. |  | ||||||
| 
 |  | ||||||
| +h(2, "download") Download |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the |     |  Download #[+a("/usage/models") models] for spaCy. The downloader finds the | ||||||
|     |  best-matching compatible version, uses pip to download the model as a |     |  best-matching compatible version, uses pip to download the model as a | ||||||
|     |  package and automatically creates a |     |  package and automatically creates a | ||||||
|     |  #[+a("/docs/usage/models#usage") shortcut link] to load the model by name. |     |  #[+a("/usage/models#usage") shortcut link] to load the model by name. | ||||||
|     |  Direct downloads don't perform any compatibility checks and require the |     |  Direct downloads don't perform any compatibility checks and require the | ||||||
|     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). |     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). | ||||||
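As a small aside, the CLI commands can also be invoked from Python. A hedged sketch, assuming spaCy v1.7+/v2.x where the command is exposed as spacy.cli.download; check your installed version before relying on it:

    # Hedged sketch: calling the download command from Python (assumes spacy.cli.download exists).
    from spacy.cli import download

    # Equivalent to running "spacy download en" on the command line;
    # pass direct=True together with a versioned name for a direct download.
    download('en')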
| 
 | 
 | ||||||
|  | @ -49,15 +40,15 @@ p | ||||||
|     |  detailed messages in case things go wrong. It's #[strong not recommended] |     |  detailed messages in case things go wrong. It's #[strong not recommended] | ||||||
|     |  to use this command as part of an automated process. If you know which |     |  to use this command as part of an automated process. If you know which | ||||||
|     |  model your project needs, you should consider a |     |  model your project needs, you should consider a | ||||||
|     |  #[+a("/docs/usage/models#download-pip") direct download via pip], or |     |  #[+a("/usage/models#download-pip") direct download via pip], or | ||||||
|     |  uploading the model to a local PyPi installation and fetching it straight |     |  uploading the model to a local PyPi installation and fetching it straight | ||||||
|     |  from there. This will also allow you to add it as a versioned package |     |  from there. This will also allow you to add it as a versioned package | ||||||
|     |  dependency to your project. |     |  dependency to your project. | ||||||
| 
 | 
 | ||||||
| +h(2, "link") Link | +h(3, "link") Link | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, |     |  Create a #[+a("/usage/models#usage") shortcut link] for a model, | ||||||
|     |  either a Python package or a local directory. This will let you load |     |  either a Python package or a local directory. This will let you load | ||||||
|     |  models from any location using a custom name via |     |  models from any location using a custom name via | ||||||
|     |  #[+api("spacy#load") #[code spacy.load()]]. |     |  #[+api("spacy#load") #[code spacy.load()]]. | ||||||
|  | @ -95,7 +86,7 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "info") Info | +h(3, "info") Info | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Print information about your spaCy installation, models and local setup, |     |  Print information about your spaCy installation, models and local setup, | ||||||
|  | @ -122,15 +113,15 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "convert") Convert | +h(3, "convert") Convert | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] |     |  Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format] | ||||||
|     |  for use with the #[code train] command and other experiment management |     |  for use with the #[code train] command and other experiment management | ||||||
|     |  functions. The right converter is chosen based on the file extension of |     |  functions. The right converter is chosen based on the file extension of | ||||||
|     |  the input file. Currently only supports #[code .conllu]. |     |  the input file. Currently only supports #[code .conllu]. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] |     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|  | @ -159,14 +150,18 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(2, "train") Train | +h(3, "train") Train | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Train a model. Expects data in spaCy's |     |  Train a model. Expects data in spaCy's | ||||||
|     |  #[+a("/docs/api/annotation#json-input") JSON format]. |     |  #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model | ||||||
|  |     |  will be saved out to the directory. Accuracy scores and model details | ||||||
|  |     |  will be added to a #[+a("/usage/training#models-generating") #[code meta.json]] | ||||||
|  |     |  to allow packaging the model using the | ||||||
|  |     |  #[+api("cli#package") #[code package]] command. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] |     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -204,6 +199,27 @@ p | ||||||
|         +cell option |         +cell option | ||||||
|         +cell Use GPU. |         +cell Use GPU. | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --vectors], #[code -v] | ||||||
|  |         +cell option | ||||||
|  |         +cell Model to load vectors from. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --meta-path], #[code -m] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  #[+tag-new(2)] Optional path to model | ||||||
|  |             |  #[+a("/usage/training#models-generating") #[code meta.json]]. | ||||||
|  |             |  All relevant properties like #[code lang], #[code pipeline] and | ||||||
|  |             |  #[code spacy_version] will be overwritten. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --version], #[code -V] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Model version. Will be written out to the model's | ||||||
|  |             |  #[code meta.json] after training. | ||||||
|  | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --no-tagger], #[code -T] |         +cell #[code --no-tagger], #[code -T] | ||||||
|         +cell flag |         +cell flag | ||||||
|  | @ -219,12 +235,18 @@ p | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Don't train NER. |         +cell Don't train NER. | ||||||
| 
 | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gold-preproc], #[code -G] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Use gold preprocessing. | ||||||
|  | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --help], #[code -h] |         +cell #[code --help], #[code -h] | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell Show help message and available arguments. |         +cell Show help message and available arguments. | ||||||
| 
 | 
 | ||||||
| +h(3, "train-hyperparams") Environment variables for hyperparameters | +h(4, "train-hyperparams") Environment variables for hyperparameters | ||||||
|  |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy lets you set hyperparameters for training via environment variables. |     |  spaCy lets you set hyperparameters for training via environment variables. | ||||||
|  | @ -236,98 +258,149 @@ p | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser |     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser | ||||||
| 
 | 
 | ||||||
| +under-construction |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Description", "Default"]) | +table(["Name", "Description", "Default"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_from] |         +cell #[code dropout_from] | ||||||
|         +cell |         +cell Initial dropout rate. | ||||||
|         +cell #[code 0.2] |         +cell #[code 0.2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_to] |         +cell #[code dropout_to] | ||||||
|         +cell |         +cell Final dropout rate. | ||||||
|         +cell #[code 0.2] |         +cell #[code 0.2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code dropout_decay] |         +cell #[code dropout_decay] | ||||||
|         +cell |         +cell Rate of dropout change. | ||||||
|         +cell #[code 0.0] |         +cell #[code 0.0] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_from] |         +cell #[code batch_from] | ||||||
|         +cell |         +cell Initial batch size. | ||||||
|         +cell #[code 1] |         +cell #[code 1] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_to] |         +cell #[code batch_to] | ||||||
|         +cell |         +cell Final batch size. | ||||||
|         +cell #[code 64] |         +cell #[code 64] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code batch_compound] |         +cell #[code batch_compound] | ||||||
|         +cell |         +cell Rate of batch size acceleration. | ||||||
|         +cell #[code 1.001] |         +cell #[code 1.001] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code token_vector_width] |         +cell #[code token_vector_width] | ||||||
|         +cell |         +cell Width of embedding tables and convolutional layers. | ||||||
|         +cell #[code 128] |         +cell #[code 128] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code embed_size] |         +cell #[code embed_size] | ||||||
|         +cell |         +cell Number of rows in embedding tables. | ||||||
|         +cell #[code 7500] |         +cell #[code 7500] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code parser_maxout_pieces] |         +cell #[code parser_maxout_pieces] | ||||||
|         +cell |         +cell Number of pieces in the parser's and NER's first maxout layer. | ||||||
|         +cell #[code 2] |         +cell #[code 2] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code parser_hidden_depth] |         +cell #[code parser_hidden_depth] | ||||||
|         +cell |         +cell Number of hidden layers in the parser and NER. | ||||||
|         +cell #[code 1] |         +cell #[code 1] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code hidden_width] |         +cell #[code hidden_width] | ||||||
|         +cell |         +cell Size of the parser's and NER's hidden layers. | ||||||
|         +cell #[code 128] |         +cell #[code 128] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code learn_rate] |         +cell #[code learn_rate] | ||||||
|         +cell |         +cell Learning rate. | ||||||
|         +cell #[code 0.001] |         +cell #[code 0.001] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_B1] |         +cell #[code optimizer_B1] | ||||||
|         +cell |         +cell Momentum for the Adam solver. | ||||||
|         +cell #[code 0.9] |         +cell #[code 0.9] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_B2] |         +cell #[code optimizer_B2] | ||||||
|         +cell |         +cell Second-moment decay rate (beta2) for the Adam solver. | ||||||
|         +cell #[code 0.999] |         +cell #[code 0.999] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code optimizer_eps] |         +cell #[code optimizer_eps] | ||||||
|         +cell |         +cell Epsilon value for the Adam solver. | ||||||
|         +cell #[code 1e-08] |         +cell #[code 1e-08] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code L2_penalty] |         +cell #[code L2_penalty] | ||||||
|         +cell |         +cell L2 regularisation penalty. | ||||||
|         +cell #[code 1e-06] |         +cell #[code 1e-06] | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code grad_norm_clip] |         +cell #[code grad_norm_clip] | ||||||
|         +cell |         +cell Gradient L2 norm constraint. | ||||||
|         +cell #[code 1.0] |         +cell #[code 1.0] | ||||||
| 
 | 
 | ||||||
| +h(2, "package") Package | +h(3, "evaluate") Evaluate | ||||||
|  |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] |     |  Evaluate a model's accuracy and speed on JSON-formatted annotated data. | ||||||
|  |     |  Will print the results and optionally export | ||||||
|  |     |  #[+a("/usage/visualizers") displaCy visualizations] of a sample set of | ||||||
|  |     |  parses to #[code .html] files. Visualizations for the dependency parse | ||||||
|  |     |  and NER will be exported as separate files if the respective component | ||||||
|  |     |  is present in the model's pipeline. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash", "$", false, false, true). | ||||||
|  |     spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc] | ||||||
|  | 
 | ||||||
|  | +table(["Argument", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code model] | ||||||
|  |         +cell positional | ||||||
|  |         +cell | ||||||
|  |             |  Model to evaluate. Can be a package or shortcut link name, or a | ||||||
|  |             |  path to a model data directory. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code data_path] | ||||||
|  |         +cell positional | ||||||
|  |         +cell Location of JSON-formatted evaluation data. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --displacy-path], #[code -dp] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Directory to output rendered parses as HTML. If not set, no | ||||||
|  |             |  visualizations will be generated. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --displacy-limit], #[code -dl] | ||||||
|  |         +cell option | ||||||
|  |         +cell | ||||||
|  |             |  Number of parses to generate per file. Defaults to #[code 25]. | ||||||
|  |             |  Keep in mind that a significantly higher number might cause the | ||||||
|  |             |  #[code .html] files to render slowly. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gpu-id], #[code -g] | ||||||
|  |         +cell option | ||||||
|  |         +cell GPU to use, if any. Defaults to #[code -1] for CPU. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code --gold-preproc], #[code -G] | ||||||
|  |         +cell flag | ||||||
|  |         +cell Use gold preprocessing. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(3, "package") Package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Generate a #[+a("/usage/training#models-generating") model Python package] | ||||||
|     |  from an existing model data directory. All data files are copied over. |     |  from an existing model data directory. All data files are copied over. | ||||||
|     |  If the path to a meta.json is supplied, or a meta.json is found in the |     |  If the path to a meta.json is supplied, or a meta.json is found in the | ||||||
|     |  input directory, this file is used. Otherwise, the data can be entered |     |  input directory, this file is used. Otherwise, the data can be entered | ||||||
|  | @ -336,8 +409,8 @@ p | ||||||
|     |  sure you're always using the latest versions. This means you need to be |     |  sure you're always using the latest versions. This means you need to be | ||||||
|     |  connected to the internet to use this command. |     |  connected to the internet to use this command. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash", "$"). | +code(false, "bash", "$", false, false, true). | ||||||
|     spacy package [input_dir] [output_dir] [--meta] [--force] |     spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||||
| 
 | 
 | ||||||
| +table(["Argument", "Type", "Description"]) | +table(["Argument", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|  | @ -353,14 +426,14 @@ p | ||||||
|     +row |     +row | ||||||
|         +cell #[code --meta-path], #[code -m] |         +cell #[code --meta-path], #[code -m] | ||||||
|         +cell option |         +cell option | ||||||
|         +cell Path to meta.json file (optional). |         +cell #[+tag-new(2)] Path to meta.json file (optional). | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --create-meta], #[code -c] |         +cell #[code --create-meta], #[code -c] | ||||||
|         +cell flag |         +cell flag | ||||||
|         +cell |         +cell | ||||||
|             |  Create a meta.json file on the command line, even if one already |             |  #[+tag-new(2)] Create a meta.json file on the command line, even | ||||||
|             |  exists in the directory. |             |  if one already exists in the directory. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code --force], #[code -f] |         +cell #[code --force], #[code -f] | ||||||
							
								
								
									
91  website/api/_top-level/_compat.jade  Normal file
							|  | @ -0,0 +1,91 @@ | ||||||
|  | //- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All Python code is written in an | ||||||
|  |     |  #[strong intersection of Python 2 and Python 3]. This is easy in Cython, | ||||||
|  |     |  but somewhat ugly in Python. Logic that deals with Python or platform | ||||||
|  |     |  compatibility only lives in #[code spacy.compat]. To distinguish them from | ||||||
|  |     |  the builtin functions, replacement functions are suffixed with an | ||||||
|  |     |  underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the | ||||||
|  |     |  #[code six] and #[code ftfy] packages. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.compat import unicode_, json_dumps | ||||||
|  | 
 | ||||||
|  |     compatible_unicode = unicode_('hello world') | ||||||
|  |     compatible_json = json_dumps({'key': 'value'}) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Python 2", "Python 3"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.bytes_] | ||||||
|  |         +cell #[code str] | ||||||
|  |         +cell #[code bytes] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.unicode_] | ||||||
|  |         +cell #[code unicode] | ||||||
|  |         +cell #[code str] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.basestring_] | ||||||
|  |         +cell #[code basestring] | ||||||
|  |         +cell #[code str] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.input_] | ||||||
|  |         +cell #[code raw_input] | ||||||
|  |         +cell #[code input] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.json_dumps] | ||||||
|  |         +cell #[code ujson.dumps] with #[code .decode('utf8')] | ||||||
|  |         +cell #[code ujson.dumps] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code compat.path2str] | ||||||
|  |         +cell #[code str(path)] with #[code .decode('utf8')] | ||||||
|  |         +cell #[code str(path)] | ||||||
|  | 
 | ||||||
|  | +h(3, "is_config") compat.is_config | ||||||
|  |     +tag function | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Check if a specific configuration of Python version and operating system | ||||||
|  |     |  matches the user's setup. Mostly used to display targeted error messages. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.compat import is_config | ||||||
|  | 
 | ||||||
|  |     if is_config(python2=True, windows=True): | ||||||
|  |         print("You are using Python 2 on Windows.") | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code python2] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed with Python 2.x. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code python3] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed with Python 3.x. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code windows] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on Windows. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code linux] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on Linux. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code osx] | ||||||
|  |         +cell bool | ||||||
|  |         +cell spaCy is executed on OS X or macOS. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the specified configuration matches the user's platform. | ||||||
|  | @ -1,14 +1,12 @@ | ||||||
| //- 💫 DOCS > API > DISPLACY | //- 💫 DOCS > API > TOP-LEVEL > DISPLACY | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  As of v2.0, spaCy comes with a built-in visualization suite. For more |     |  As of v2.0, spaCy comes with a built-in visualization suite. For more | ||||||
|     |  info and examples, see the usage guide on |     |  info and examples, see the usage guide on | ||||||
|     |  #[+a("/docs/usage/visualizers") visualizing spaCy]. |     |  #[+a("/usage/visualizers") visualizing spaCy]. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "serve") displacy.serve | +h(3, "displacy.serve") displacy.serve | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -60,7 +58,7 @@ p | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell |         +cell | ||||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] |             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||||
|             |  for formats and examples. |             |  for formats and examples. | ||||||
|         +cell #[code False] |         +cell #[code False] | ||||||
| 
 | 
 | ||||||
|  | @ -70,7 +68,7 @@ p | ||||||
|         +cell Port to serve visualization. |         +cell Port to serve visualization. | ||||||
|         +cell #[code 5000] |         +cell #[code 5000] | ||||||
| 
 | 
 | ||||||
| +h(2, "render") displacy.render | +h(3, "displacy.render") displacy.render | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization. | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell |         +cell | ||||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of |             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] |             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||||
|             |  for formats and examples. |             |  for formats and examples. | ||||||
|         +cell #[code False] |         +cell #[code False] | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Rendered HTML markup. |         +cell Rendered HTML markup. | ||||||
|         +cell |         +cell | ||||||
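As a usage sketch of the returned markup (the model name and output path are placeholders, not part of this commit):

    import io

    import spacy
    from spacy import displacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Apple is looking at buying a U.K. startup for $1 billion.')
    html = displacy.render(doc, style='ent', page=True)  # unicode HTML string
    with io.open('entities.html', 'w', encoding='utf8') as f:
        f.write(html)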
| 
 | 
 | ||||||
| +h(2, "options") Visualizer options | +h(3, "displacy_options") Visualizer options | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  The #[code options] argument lets you specify additional settings for |     |  The #[code options] argument lets you specify additional settings for | ||||||
|     |  each visualizer. If a setting is not present in the options, the default |     |  each visualizer. If a setting is not present in the options, the default | ||||||
|     |  value will be used. |     |  value will be used. | ||||||
| 
 | 
 | ||||||
| +h(3, "options-dep") Dependency Visualizer options | +h(4, "options-dep") Dependency Visualizer options | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     options = {'compact': True, 'color': 'blue'} |     options = {'compact': True, 'color': 'blue'} | ||||||
|  | @ -219,7 +217,7 @@ p | ||||||
|         +cell Distance between words in px. |         +cell Distance between words in px. | ||||||
|         +cell #[code 175] / #[code 85] (compact) |         +cell #[code 175] / #[code 85] (compact) | ||||||
| 
 | 
 | ||||||
| +h(3, "options-ent") Named Entity Visualizer options | +h(4, "displacy_options-ent") Named Entity Visualizer options | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], |     options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], | ||||||
|  | @ -244,6 +242,6 @@ p | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  By default, displaCy comes with colours for all |     |  By default, displaCy comes with colours for all | ||||||
|     |  #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. |     |  #[+a("/api/annotation#named-entities") entity types supported by spaCy]. | ||||||
|     |  If you're using custom entity types, you can use the #[code colors] |     |  If you're using custom entity types, you can use the #[code colors] | ||||||
|     |  setting to add your own colours for them. |     |  setting to add your own colours for them. | ||||||
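A hedged sketch of the colors setting with a made-up entity type, using the manual dict format mentioned above so it runs without a loaded model:

    from spacy import displacy

    ex = {'text': 'But Google is starting from behind.',
          'ents': [{'start': 4, 'end': 10, 'label': 'MY_ORG'}],
          'title': None}
    colors = {'MY_ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
    options = {'ents': ['MY_ORG'], 'colors': colors}
    html = displacy.render(ex, style='ent', manual=True, options=options)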
|  | @ -1,15 +1,13 @@ | ||||||
| //- 💫 DOCS > API > SPACY | //- 💫 DOCS > API > TOP-LEVEL > SPACY | ||||||
| 
 | 
 | ||||||
| include ../../_includes/_mixins | +h(3, "spacy.load") spacy.load | ||||||
| 
 |  | ||||||
| +h(2, "load") spacy.load |  | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-model |     +tag-model | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Load a model via its #[+a("/docs/usage/models#usage") shortcut link], |     |  Load a model via its #[+a("/usage/models#usage") shortcut link], | ||||||
|     |  the name of an installed |     |  the name of an installed | ||||||
|     |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode |     |  #[+a("/usage/training#models-generating") model package], a unicode | ||||||
|     |  path or a #[code Path]-like object. spaCy will try resolving the load |     |  path or a #[code Path]-like object. spaCy will try resolving the load | ||||||
|     |  argument in this order. If a model is loaded from a shortcut link or |     |  argument in this order. If a model is loaded from a shortcut link or | ||||||
|     |  package name, spaCy will assume it's a Python package and import it and |     |  package name, spaCy will assume it's a Python package and import it and | ||||||
|  | @ -38,25 +36,57 @@ p | ||||||
|         +cell list |         +cell list | ||||||
|         +cell |         +cell | ||||||
|             |  Names of pipeline components to |             |  Names of pipeline components to | ||||||
|             |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. |             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell A #[code Language] object with the loaded model. |         +cell A #[code Language] object with the loaded model. | ||||||
| 
 | 
 | ||||||
| +infobox("⚠️ Deprecation note") | +infobox("Deprecation note", "⚠️") | ||||||
|     .o-block |     .o-block | ||||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy |         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||||
|         |  will also raise an error if no model could be loaded and never just |         |  will also raise an error if no model could be loaded and never just | ||||||
|         |  return an empty #[code Language] object. If you need a blank language, |         |  return an empty #[code Language] object. If you need a blank language, | ||||||
|         |  you need to import it explicitly (#[code from spacy.lang.en import English]) |         |  you can use the new function #[+api("spacy#blank") #[code spacy.blank()]] | ||||||
|         |  or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. |         |  or import the class explicitly, e.g. | ||||||
|  |         |  #[code from spacy.lang.en import English]. | ||||||
| 
 | 
 | ||||||
|     +code-new nlp = spacy.load('/model') |     +code-new nlp = spacy.load('/model') | ||||||
|     +code-old nlp = spacy.load('en', path='/model') |     +code-old nlp = spacy.load('en', path='/model') | ||||||
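A short sketch of the resolution order described above; the package name and paths are placeholders for whatever is installed locally:

    import spacy

    # 1) shortcut link or installed package name, imported as a Python package
    nlp = spacy.load('en_core_web_sm')

    # 2) path to a model data directory
    nlp_from_dir = spacy.load('/model')

    # pipeline components can be disabled on load
    nlp_no_parse = spacy.load('en_core_web_sm', disable=['parser'])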
| 
 | 
 | ||||||
| +h(2, "info") spacy.info | +h(3, "spacy.blank") spacy.blank | ||||||
|  |     +tag function | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Create a blank model of a given language class. This function is the | ||||||
|  |     |  twin of #[code spacy.load()]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp_en = spacy.blank('en') | ||||||
|  |     nlp_de = spacy.blank('de') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell ISO code of the language class to load. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code disable] | ||||||
|  |         +cell list | ||||||
|  |         +cell | ||||||
|  |             |  Names of pipeline components to | ||||||
|  |             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell #[code Language] | ||||||
|  |         +cell An empty #[code Language] object of the appropriate subclass. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(4, "spacy.info") spacy.info | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -83,13 +113,13 @@ p | ||||||
|         +cell Print information as Markdown. |         +cell Print information as Markdown. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "explain") spacy.explain | +h(3, "spacy.explain") spacy.explain | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Get a description for a given POS tag, dependency label or entity type. |     |  Get a description for a given POS tag, dependency label or entity type. | ||||||
|     |  For a list of available terms, see |     |  For a list of available terms, see | ||||||
|     |  #[+src(gh("spacy", "spacy/glossary.py")) glossary.py]. |     |  #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     spacy.explain('NORP') |     spacy.explain('NORP') | ||||||
|  | @ -107,18 +137,18 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Term to explain. |         +cell Term to explain. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell The explanation, or #[code None] if not found in the glossary. |         +cell The explanation, or #[code None] if not found in the glossary. | ||||||
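For example, explain() can be combined with a processed Doc (assumes an installed English model; unknown terms simply return None):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
    for token in doc:
        print(token.text, token.tag_, spacy.explain(token.tag_))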
| 
 | 
 | ||||||
| +h(2, "set_factory") spacy.set_factory | +h(3, "spacy.set_factory") spacy.set_factory | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Set a factory that returns a custom |     |  Set a factory that returns a custom | ||||||
|     |  #[+a("/docs/usage/language-processing-pipeline") processing pipeline] |     |  #[+a("/usage/processing-pipelines") processing pipeline] | ||||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. |     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -1,10 +1,8 @@ | ||||||
| //- 💫 DOCS > API > UTIL | //- 💫 DOCS > API > TOP-LEVEL > UTIL | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy comes with a small collection of utility functions located in |     |  spaCy comes with a small collection of utility functions located in | ||||||
|     |  #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. |     |  #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]]. | ||||||
|     |  Because utility functions are mostly intended for |     |  Because utility functions are mostly intended for | ||||||
|     |  #[strong internal use within spaCy], their behaviour may change with |     |  #[strong internal use within spaCy], their behaviour may change with | ||||||
|     |  future releases. The functions documented on this page should be safe |     |  future releases. The functions documented on this page should be safe | ||||||
|  | @ -12,7 +10,7 @@ p | ||||||
|     |  recommend having additional tests in place if your application depends on |     |  recommend having additional tests in place if your application depends on | ||||||
|     |  any of spaCy's utilities. |     |  any of spaCy's utilities. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_data_path") util.get_data_path | +h(3, "util.get_data_path") util.get_data_path | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -25,12 +23,12 @@ p | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell Only return path if it exists, otherwise return #[code None]. |         +cell Only return path if it exists, otherwise return #[code None]. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Path] / #[code None] |         +cell #[code Path] / #[code None] | ||||||
|         +cell Data path or #[code None]. |         +cell Data path or #[code None]. | ||||||
| 
 | 
 | ||||||
| +h(2, "set_data_path") util.set_data_path | +h(3, "util.set_data_path") util.set_data_path | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|  | @ -47,12 +45,12 @@ p | ||||||
|         +cell unicode or #[code Path] |         +cell unicode or #[code Path] | ||||||
|         +cell Path to new data directory. |         +cell Path to new data directory. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_lang_class") util.get_lang_class | +h(3, "util.get_lang_class") util.get_lang_class | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Import and load a #[code Language] class. Allows lazy-loading |     |  Import and load a #[code Language] class. Allows lazy-loading | ||||||
|     |  #[+a("/docs/usage/adding-languages") language data] and importing |     |  #[+a("/usage/adding-languages") language data] and importing | ||||||
|     |  languages using the two-letter language code. |     |  languages using the two-letter language code. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -67,12 +65,12 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Two-letter language code, e.g. #[code 'en']. |         +cell Two-letter language code, e.g. #[code 'en']. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell Language class. |         +cell Language class. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model") util.load_model | +h(3, "util.load_model") util.load_model | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -101,12 +99,12 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model_from_path") util.load_model_from_path | +h(3, "util.load_model_from_path") util.load_model_from_path | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -139,18 +137,18 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "load_model_from_init_py") util.load_model_from_init_py | +h(3, "util.load_model_from_init_py") util.load_model_from_init_py | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  A helper function to use in the #[code load()] method of a model package's |     |  A helper function to use in the #[code load()] method of a model package's | ||||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. |     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     from spacy.util import load_model_from_init_py |     from spacy.util import load_model_from_init_py | ||||||
|  | @ -169,12 +167,12 @@ p | ||||||
|         +cell - |         +cell - | ||||||
|         +cell Specific overrides, like pipeline components to disable. |         +cell Specific overrides, like pipeline components to disable. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell #[code Language] class with the loaded model. |         +cell #[code Language] class with the loaded model. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_model_meta") util.get_model_meta | +h(3, "util.get_model_meta") util.get_model_meta | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -190,17 +188,17 @@ p | ||||||
|         +cell unicode or #[code Path] |         +cell unicode or #[code Path] | ||||||
|         +cell Path to model directory. |         +cell Path to model directory. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell dict |         +cell dict | ||||||
|         +cell The model's meta data. |         +cell The model's meta data. | ||||||
| 
 | 
 | ||||||
| +h(2, "is_package") util.is_package | +h(3, "util.is_package") util.is_package | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Check if string maps to a package installed via pip. Mainly used to |     |  Check if string maps to a package installed via pip. Mainly used to | ||||||
|     |  validate #[+a("/docs/usage/models") model packages]. |     |  validate #[+a("/usage/models") model packages]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     util.is_package('en_core_web_sm') # True |     util.is_package('en_core_web_sm') # True | ||||||
|  | @ -212,18 +210,18 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Name of package. |         +cell Name of package. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code bool] |         +cell #[code bool] | ||||||
|         +cell #[code True] if installed package, #[code False] if not. |         +cell #[code True] if installed package, #[code False] if not. | ||||||
| 
 | 
 | ||||||
| +h(2, "get_package_path") util.get_package_path | +h(3, "util.get_package_path") util.get_package_path | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Get path to an installed package. Mainly used to resolve the location of |     |  Get path to an installed package. Mainly used to resolve the location of | ||||||
|     |  #[+a("/docs/usage/models") model packages]. Currently imports the package |     |  #[+a("/usage/models") model packages]. Currently imports the package | ||||||
|     |  to find its path. |     |  to find its path. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  | @ -236,12 +234,12 @@ p | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell Name of installed package. |         +cell Name of installed package. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell #[code Path] |         +cell #[code Path] | ||||||
|         +cell Path to model package directory. |         +cell Path to model package directory. | ||||||
| 
 | 
 | ||||||
| +h(2, "is_in_jupyter") util.is_in_jupyter | +h(3, "util.is_in_jupyter") util.is_in_jupyter | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
|  | @ -257,17 +255,17 @@ p | ||||||
|         return display(HTML(html)) |         return display(HTML(html)) | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell bool |         +cell bool | ||||||
|         +cell #[code True] if in Jupyter, #[code False] if not. |         +cell #[code True] if in Jupyter, #[code False] if not. | ||||||
| 
 | 
 | ||||||
| +h(2, "update_exc") util.update_exc | +h(3, "util.update_exc") util.update_exc | ||||||
|     +tag function |     +tag function | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Update, validate and overwrite |     |  Update, validate and overwrite | ||||||
|     |  #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. |     |  #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | ||||||
|     |  Used to combine global exceptions with custom, language-specific |     |  Used to combine global exceptions with custom, language-specific | ||||||
|     |  exceptions. Will raise an error if key doesn't match #[code ORTH] values. |     |  exceptions. Will raise an error if key doesn't match #[code ORTH] values. | ||||||
| 
 | 
 | ||||||
|  | @ -288,20 +286,20 @@ p | ||||||
|         +cell dicts |         +cell dicts | ||||||
|         +cell Exception dictionaries to add to the base exceptions, in order. |         +cell Exception dictionaries to add to the base exceptions, in order. | ||||||
| 
 | 
 | ||||||
|     +footrow |     +row("foot") | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell dict |         +cell dict | ||||||
|         +cell Combined tokenizer exceptions. |         +cell Combined tokenizer exceptions. | ||||||
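A hedged sketch of combining exception dicts, assuming the tokenizer-exception format from the adding-languages guide, where each key must equal the concatenated ORTH values of its token dicts:

    from spacy.symbols import ORTH, LEMMA
    from spacy.util import update_exc

    BASE_EXCEPTIONS = {"a.m.": [{ORTH: "a.m."}]}
    custom = {"gonna": [{ORTH: "gon", LEMMA: "go"}, {ORTH: "na", LEMMA: "to"}]}

    # base exceptions first, then any number of override dicts, applied in order
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, custom)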
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| +h(2, "prints") util.prints | +h(3, "util.prints") util.prints | ||||||
|     +tag function |     +tag function | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Print a formatted, text-wrapped message with optional title. If a text |     |  Print a formatted, text-wrapped message with optional title. If a text | ||||||
|     |  argument is a #[code Path], it's converted to a string. Should only |     |  argument is a #[code Path], it's converted to a string. Should only | ||||||
|     |  be used for interactive components like the #[+api("cli") cli]. |     |  be used for interactive components like the command-line interface. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     data_path = Path('/some/path') |     data_path = Path('/some/path') | ||||||
							
								
								
									
website/api/annotation.jade (new file, 131 lines)
							|  | @ -0,0 +1,131 @@ | ||||||
|  | //- 💫 DOCS > API > ANNOTATION SPECS | ||||||
|  | 
 | ||||||
|  | include ../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p This document describes the target annotations spaCy is trained to predict. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +section("tokenization") | ||||||
|  |     +h(2, "tokenization") Tokenization | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  Tokenization standards are based on the | ||||||
|  |         |  #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus. | ||||||
|  |         |  The tokenizer differs from most by including tokens for significant | ||||||
|  |         |  whitespace. Any sequence of whitespace characters beyond a single space | ||||||
|  |         |  (#[code ' ']) is included as a token. | ||||||
|  | 
 | ||||||
|  |     +aside-code("Example"). | ||||||
|  |         from spacy.lang.en import English | ||||||
|  |         nlp = English() | ||||||
|  |         tokens = nlp('Some\nspaces  and\ttab characters') | ||||||
|  |         tokens_text = [t.text for t in tokens] | ||||||
|  |         assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', | ||||||
|  |                             '\t', 'tab', 'characters'] | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  The whitespace tokens are useful for much the same reason punctuation is | ||||||
|  |         |  – they're often important delimiters in the text. By preserving them in the | ||||||
|  |         |  token output, we are able to maintain a simple alignment between the | ||||||
|  |         |  tokens and the original string, and we ensure that no information is | ||||||
|  |         |  lost during processing. | ||||||
|  | 
 | ||||||
|  | +section("sbd") | ||||||
|  |     +h(2, "sentence-boundary") Sentence boundary detection | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  Sentence boundaries are calculated from the syntactic parse tree, so | ||||||
|  |         |  features such as punctuation and capitalisation play an important but | ||||||
|  |         |  non-decisive role in determining the sentence boundaries. Usually this | ||||||
|  |         |  means that the sentence boundaries will at least coincide with clause | ||||||
|  |         |  boundaries, even given poorly punctuated text. | ||||||
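For instance, with a model whose parser is available, the computed boundaries are exposed as Doc.sents (the model name is a placeholder):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    # the period after the first clause is left out on purpose: boundaries come
    # from the parse, not from punctuation alone
    doc = nlp(u'This is a sentence this is another one.')
    for sent in doc.sents:
        print(sent.text)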
|  | 
 | ||||||
|  | +section("pos-tagging") | ||||||
|  |     +h(2, "pos-tagging") Part-of-speech Tagging | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding tags") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of a tag. For example, | ||||||
|  |         |  #[code spacy.explain("RB")] will return "adverb". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_pos-tags | ||||||
|  | 
 | ||||||
|  | +section("lemmatization") | ||||||
|  |     +h(2, "lemmatization") Lemmatization | ||||||
|  | 
 | ||||||
|  |     p A "lemma" is the uninflected form of a word. In English, this means: | ||||||
|  | 
 | ||||||
|  |     +list | ||||||
|  |         +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest" | ||||||
|  |         +item #[strong Adverbs]: The form like "badly", not "worse" or "worst" | ||||||
|  |         +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" | ||||||
|  |         +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" | ||||||
|  | 
 | ||||||
|  |     p | ||||||
|  |         |  The lemmatization data is taken from | ||||||
|  |         |  #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a | ||||||
|  |         |  special case for pronouns: all pronouns are lemmatized to the special | ||||||
|  |         |  token #[code -PRON-]. | ||||||
|  | 
 | ||||||
|  |     +infobox("About spaCy's custom pronoun lemma") | ||||||
|  |         |  Unlike verbs and common nouns, there's no clear base form of a personal | ||||||
|  |         |  pronoun. Should the lemma of "me" be "I", or should we normalize person | ||||||
|  |         |  as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a | ||||||
|  |         |  novel symbol, #[code -PRON-], which is used as the lemma for | ||||||
|  |         |  all personal pronouns. | ||||||
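An illustrative check of the pronoun lemma (assumes an installed English v2.x model; the printed result is what the text above implies, not a guaranteed output):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I saw her yesterday.')
    print([(t.text, t.lemma_) for t in doc if t.pos_ == 'PRON'])
    # expected along the lines of: [('I', '-PRON-'), ('her', '-PRON-')]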
|  | 
 | ||||||
|  | +section("dependency-parsing") | ||||||
|  |     +h(2, "dependency-parsing") Syntactic Dependency Parsing | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding labels") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of a label. For example, | ||||||
|  |         |  #[code spacy.explain("prt")] will return "particle". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_dep-labels | ||||||
|  | 
 | ||||||
|  | +section("named-entities") | ||||||
|  |     +h(2, "named-entities") Named Entity Recognition | ||||||
|  | 
 | ||||||
|  |     +aside("Tip: Understanding entity types") | ||||||
|  |         |  You can also use #[code spacy.explain()] to get the description for the | ||||||
|  |         |  string representation of an entity label. For example, | ||||||
|  |         |  #[code spacy.explain("LANGUAGE")] will return "any named language". | ||||||
|  | 
 | ||||||
|  |     include _annotation/_named-entities | ||||||
|  | 
 | ||||||
|  |     +h(3, "biluo") BILUO Scheme | ||||||
|  | 
 | ||||||
|  |     include _annotation/_biluo | ||||||
|  | 
 | ||||||
|  | +section("training") | ||||||
|  |     +h(2, "json-input") JSON input format for training | ||||||
|  | 
 | ||||||
|  |     +under-construction | ||||||
|  | 
 | ||||||
|  |     p spaCy takes training data in the following format: | ||||||
|  | 
 | ||||||
|  |     +code("Example structure"). | ||||||
|  |         doc: { | ||||||
|  |             id: string, | ||||||
|  |             paragraphs: [{ | ||||||
|  |                 raw: string, | ||||||
|  |                 sents: [int], | ||||||
|  |                 tokens: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     tag: string, | ||||||
|  |                     head: int, | ||||||
|  |                     dep: string | ||||||
|  |                 }], | ||||||
|  |                 ner: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     end: int, | ||||||
|  |                     label: string | ||||||
|  |                 }], | ||||||
|  |                 brackets: [{ | ||||||
|  |                     start: int, | ||||||
|  |                     end: int, | ||||||
|  |                     label: string | ||||||
|  |                 }] | ||||||
|  |             }] | ||||||
|  |         } | ||||||
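A minimal concrete instance of the structure above, written as a Python dict purely for illustration; every value is invented and the head/offset conventions are assumptions, not something this document specifies:

    training_doc = {
        "id": "doc-0001",
        "paragraphs": [{
            "raw": "Apple is looking at buying a startup.",
            "sents": [0],
            "tokens": [
                {"start": 0, "tag": "NNP", "head": 2, "dep": "nsubj"},
                {"start": 6, "tag": "VBZ", "head": 1, "dep": "aux"},
            ],
            "ner": [
                {"start": 0, "end": 5, "label": "ORG"},
            ],
            "brackets": [],
        }]
    }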
|  | @ -1,6 +1,6 @@ | ||||||
| //- 💫 DOCS > API > BINDER | //- 💫 DOCS > API > BINDER | ||||||
| 
 | 
 | ||||||
| include ../../_includes/_mixins | include ../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p A container class for serializing collections of #[code Doc] objects. | p A container class for serializing collections of #[code Doc] objects. | ||||||
| 
 | 
 | ||||||
							
								
								
									
website/api/dependencyparser.jade (new file, 5 lines)
							|  | @ -0,0 +1,5 @@ | ||||||
|  | //- 💫 DOCS > API > DEPENDENCYPARSER | ||||||
|  | 
 | ||||||
|  | include ../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | !=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" }) | ||||||
Some files were not shown because too many files have changed in this diff.