Commit c6cd81f192: Wrap try/except around model saving
(mirror of https://github.com/explosion/spaCy.git)
appveyor.yml:
@@ -1 +1,55 @@
| environment: | ||||
| 
 | ||||
|   matrix: | ||||
| 
 | ||||
|     # For Python versions available on Appveyor, see | ||||
|     # http://www.appveyor.com/docs/installed-software#python | ||||
|     # The list here is complete (excluding Python 2.6, which | ||||
|     # isn't covered by this document) at the time of writing. | ||||
| 
 | ||||
|     - PYTHON: "C:\\Python27" | ||||
|     #- PYTHON: "C:\\Python33" | ||||
|     #- PYTHON: "C:\\Python34" | ||||
|     #- PYTHON: "C:\\Python35" | ||||
|     #- PYTHON: "C:\\Python27-x64" | ||||
|     #- PYTHON: "C:\\Python33-x64" | ||||
|     #- DISTUTILS_USE_SDK: "1" | ||||
|     #- PYTHON: "C:\\Python34-x64" | ||||
|     #- DISTUTILS_USE_SDK: "1" | ||||
|     #- PYTHON: "C:\\Python35-x64" | ||||
|     - PYTHON: "C:\\Python36-x64" | ||||
| 
 | ||||
| install: | ||||
|   # We need wheel installed to build wheels | ||||
|   - "%PYTHON%\\python.exe -m pip install wheel" | ||||
|   - "%PYTHON%\\python.exe -m pip install cython" | ||||
|   - "%PYTHON%\\python.exe -m pip install -r requirements.txt" | ||||
|   - "%PYTHON%\\python.exe -m pip install -e ." | ||||
| 
 | ||||
| build: off | ||||
| 
 | ||||
| test_script: | ||||
|   # Put your test command here. | ||||
|   # If you don't need to build C extensions on 64-bit Python 3.3 or 3.4, | ||||
|   # you can remove "build.cmd" from the front of the command, as it's | ||||
|   # only needed to support those cases. | ||||
|   # Note that you must use the environment variable %PYTHON% to refer to | ||||
|   # the interpreter you're using - Appveyor does not do anything special | ||||
|   # to put the Python version you want to use on PATH. | ||||
|   - "%PYTHON%\\python.exe -m pytest spacy/" | ||||
| 
 | ||||
| after_test: | ||||
|   # This step builds your wheels. | ||||
|   # Again, you only need build.cmd if you're building C extensions for | ||||
|   # 64-bit Python 3.3/3.4. And you need to use %PYTHON% to get the correct | ||||
|   # interpreter | ||||
|   - "%PYTHON%\\python.exe setup.py bdist_wheel" | ||||
| 
 | ||||
| artifacts: | ||||
|   # bdist_wheel puts your built wheel in the dist directory | ||||
|   - path: dist\* | ||||
| 
 | ||||
| #on_success: | ||||
| #  You can use this step to upload your artifacts to a public website. | ||||
| #  See Appveyor's documentation for more details. Or you can simply | ||||
| #  access your wheels from the Appveyor "artifacts" tab for your build. | ||||
.buildkite/sdist.yml (new file):
@@ -0,0 +1,11 @@
| steps: | ||||
|   - | ||||
|     command: "fab env clean make test sdist" | ||||
|     label: ":dizzy: :python:" | ||||
|     artifact_paths: "dist/*.tar.gz" | ||||
|   - wait | ||||
|   - trigger: "spacy-sdist-against-models" | ||||
|     label: ":dizzy: :hammer:" | ||||
|     build: | ||||
|       env: | ||||
|         SPACY_VERSION: "{$SPACY_VERSION}" | ||||
							
								
								
									
.gitignore (vendored):
@@ -1,14 +1,12 @@
| # spaCy | ||||
| spacy/data/ | ||||
| corpora/ | ||||
| models/ | ||||
| /models/ | ||||
| keys/ | ||||
| 
 | ||||
| # Website | ||||
| website/www/ | ||||
| website/_deploy.sh | ||||
| website/package.json | ||||
| website/announcement.jade | ||||
| website/.gitignore | ||||
| 
 | ||||
| # Cython / C extensions | ||||
|  |  | |||
Deleted example (a work-in-progress Chainer sentiment model):
@@ -1,322 +0,0 @@
| '''WIP --- Doesn't work well yet''' | ||||
| import plac | ||||
| import random | ||||
| import six | ||||
| 
 | ||||
| import cProfile | ||||
| import pstats | ||||
| 
 | ||||
| import pathlib | ||||
| import cPickle as pickle | ||||
| from itertools import izip | ||||
| 
 | ||||
| import spacy | ||||
| 
 | ||||
| import cytoolz | ||||
| import cupy as xp | ||||
| import cupy.cuda | ||||
| import chainer.cuda | ||||
| 
 | ||||
| import chainer.links as L | ||||
| import chainer.functions as F | ||||
| from chainer import Chain, Variable, report | ||||
| import chainer.training | ||||
| import chainer.optimizers | ||||
| from chainer.training import extensions | ||||
| from chainer.iterators import SerialIterator | ||||
| from chainer.datasets import TupleDataset | ||||
| 
 | ||||
| 
 | ||||
| class SentimentAnalyser(object): | ||||
|     @classmethod | ||||
|     def load(cls, path, nlp, max_length=100): | ||||
|         raise NotImplementedError | ||||
|         #with (path / 'config.json').open() as file_: | ||||
|         #    model = model_from_json(file_.read()) | ||||
|         #with (path / 'model').open('rb') as file_: | ||||
|         #    lstm_weights = pickle.load(file_) | ||||
|         #embeddings = get_embeddings(nlp.vocab) | ||||
|         #model.set_weights([embeddings] + lstm_weights) | ||||
|         #return cls(model, max_length=max_length) | ||||
| 
 | ||||
|     def __init__(self, model, max_length=100): | ||||
|         self._model = model | ||||
|         self.max_length = max_length | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         X = get_features([doc], self.max_length) | ||||
|         y = self._model.predict(X) | ||||
|         self.set_sentiment(doc, y) | ||||
| 
 | ||||
|     def pipe(self, docs, batch_size=1000, n_threads=2): | ||||
|         for minibatch in cytoolz.partition_all(batch_size, docs): | ||||
|             minibatch = list(minibatch) | ||||
|             sentences = [] | ||||
|             for doc in minibatch: | ||||
|                 sentences.extend(doc.sents) | ||||
|             Xs = get_features(sentences, self.max_length) | ||||
|             ys = self._model.predict(Xs) | ||||
|             for sent, label in zip(sentences, ys): | ||||
|                 sent.doc.sentiment += label - 0.5 | ||||
|             for doc in minibatch: | ||||
|                 yield doc | ||||
| 
 | ||||
|     def set_sentiment(self, doc, y): | ||||
|         doc.sentiment = float(y[0]) | ||||
|         # Sentiment has a native slot for a single float. | ||||
|         # For arbitrary data storage, there's: | ||||
|         # doc.user_data['my_data'] = y | ||||
| 
 | ||||
| 
 | ||||
| class Classifier(Chain): | ||||
|     def __init__(self, predictor): | ||||
|         super(Classifier, self).__init__(predictor=predictor) | ||||
| 
 | ||||
|     def __call__(self, x, t): | ||||
|         y = self.predictor(x) | ||||
|         loss = F.softmax_cross_entropy(y, t) | ||||
|         accuracy = F.accuracy(y, t) | ||||
|         report({'loss': loss, 'accuracy': accuracy}, self) | ||||
|         return loss | ||||
| 
 | ||||
| 
 | ||||
| class SentimentModel(Chain): | ||||
|     def __init__(self, nlp, shape, **settings): | ||||
|         Chain.__init__(self, | ||||
|             embed=_Embed(shape['nr_vector'], shape['nr_dim'], shape['nr_hidden'], | ||||
|                 set_vectors=lambda arr: set_vectors(arr, nlp.vocab)), | ||||
|             encode=_Encode(shape['nr_hidden'], shape['nr_hidden']), | ||||
|             attend=_Attend(shape['nr_hidden'], shape['nr_hidden']), | ||||
|             predict=_Predict(shape['nr_hidden'], shape['nr_class'])) | ||||
|         self.to_gpu(0) | ||||
| 
 | ||||
|     def __call__(self, sentence): | ||||
|         return self.predict( | ||||
|                   self.attend( | ||||
|                       self.encode( | ||||
|                           self.embed(sentence)))) | ||||
| 
 | ||||
| 
 | ||||
| class _Embed(Chain): | ||||
|     def __init__(self, nr_vector, nr_dim, nr_out, set_vectors=None): | ||||
|         Chain.__init__(self, | ||||
|             embed=L.EmbedID(nr_vector, nr_dim, initialW=set_vectors), | ||||
|             project=L.Linear(None, nr_out, nobias=True)) | ||||
|         self.embed.W.volatile = False | ||||
| 
 | ||||
|     def __call__(self, sentence): | ||||
|         return [self.project(self.embed(ts)) for ts in F.transpose(sentence)] | ||||
| 
 | ||||
| 
 | ||||
| class _Encode(Chain): | ||||
|     def __init__(self, nr_in, nr_out): | ||||
|         Chain.__init__(self, | ||||
|             fwd=L.LSTM(nr_in, nr_out), | ||||
|             bwd=L.LSTM(nr_in, nr_out), | ||||
|             mix=L.Bilinear(nr_out, nr_out, nr_out)) | ||||
| 
 | ||||
|     def __call__(self, sentence): | ||||
|         self.fwd.reset_state() | ||||
|         fwds = map(self.fwd, sentence) | ||||
|         self.bwd.reset_state() | ||||
|         bwds = reversed(map(self.bwd, reversed(sentence))) | ||||
|         return [F.elu(self.mix(f, b)) for f, b in zip(fwds, bwds)] | ||||
| 
 | ||||
| 
 | ||||
| class _Attend(Chain): | ||||
|     def __init__(self, nr_in, nr_out): | ||||
|         Chain.__init__(self) | ||||
| 
 | ||||
|     def __call__(self, sentence): | ||||
|         sent = sum(sentence) | ||||
|         return sent | ||||
| 
 | ||||
| 
 | ||||
| class _Predict(Chain): | ||||
|     def __init__(self, nr_in, nr_out): | ||||
|         Chain.__init__(self, | ||||
|             l1=L.Linear(nr_in, nr_in), | ||||
|             l2=L.Linear(nr_in, nr_out)) | ||||
| 
 | ||||
|     def __call__(self, vector): | ||||
|         vector = self.l1(vector) | ||||
|         vector = F.elu(vector) | ||||
|         vector = self.l2(vector) | ||||
|         return vector | ||||
| 
 | ||||
| 
 | ||||
| class SentenceDataset(TupleDataset): | ||||
|     def __init__(self, nlp, texts, labels, max_length): | ||||
|         self.max_length = max_length | ||||
|         sents, labels = self._get_labelled_sentences( | ||||
|             nlp.pipe(texts, batch_size=5000, n_threads=3), | ||||
|             labels) | ||||
|         TupleDataset.__init__(self, | ||||
|             get_features(sents, max_length), | ||||
|             labels) | ||||
| 
 | ||||
|     def __getitem__(self, index): | ||||
|         batches = [dataset[index] for dataset in self._datasets] | ||||
|         if isinstance(index, slice): | ||||
|             length = len(batches[0]) | ||||
|             returns = [tuple([batch[i] for batch in batches]) | ||||
|                        for i in six.moves.range(length)] | ||||
|             return returns | ||||
|         else: | ||||
|             return tuple(batches) | ||||
| 
 | ||||
|     def _get_labelled_sentences(self, docs, doc_labels): | ||||
|         labels = [] | ||||
|         sentences = [] | ||||
|         for doc, y in izip(docs, doc_labels): | ||||
|             for sent in doc.sents: | ||||
|                 sentences.append(sent) | ||||
|                 labels.append(y) | ||||
|         return sentences, xp.asarray(labels, dtype='i') | ||||
| 
 | ||||
| 
 | ||||
| class DocDataset(TupleDataset): | ||||
|     def __init__(self, nlp, texts, labels): | ||||
|         self.max_length = max_length | ||||
|         DatasetMixin.__init__(self, | ||||
|             get_features( | ||||
|                 nlp.pipe(texts, batch_size=5000, n_threads=3), self.max_length), | ||||
|             labels) | ||||
| 
 | ||||
| def read_data(data_dir, limit=0): | ||||
|     examples = [] | ||||
|     for subdir, label in (('pos', 1), ('neg', 0)): | ||||
|         for filename in (data_dir / subdir).iterdir(): | ||||
|             with filename.open() as file_: | ||||
|                 text = file_.read() | ||||
|             examples.append((text, label)) | ||||
|     random.shuffle(examples) | ||||
|     if limit >= 1: | ||||
|         examples = examples[:limit] | ||||
|     return zip(*examples) # Unzips into two lists | ||||
| 
 | ||||
| 
 | ||||
| def get_features(docs, max_length): | ||||
|     docs = list(docs) | ||||
|     Xs = xp.zeros((len(docs), max_length), dtype='i') | ||||
|     for i, doc in enumerate(docs): | ||||
|         j = 0 | ||||
|         for token in doc: | ||||
|             if token.has_vector and not token.is_punct and not token.is_space: | ||||
|                 Xs[i, j] = token.norm | ||||
|                 j += 1 | ||||
|                 if j >= max_length: | ||||
|                     break | ||||
|     return Xs | ||||
| 
 | ||||
| 
 | ||||
| def set_vectors(vectors, vocab): | ||||
|     for lex in vocab: | ||||
|         if lex.has_vector and (lex.rank+1) < vectors.shape[0]: | ||||
|             lex.norm = lex.rank+1 | ||||
|             vectors[lex.rank + 1] = lex.vector | ||||
|         else: | ||||
|             lex.norm = 0 | ||||
|     return vectors | ||||
| 
 | ||||
| 
 | ||||
| def train(train_texts, train_labels, dev_texts, dev_labels, | ||||
|         lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, | ||||
|         by_sentence=True): | ||||
|     nlp = spacy.load('en', entity=False) | ||||
|     if 'nr_vector' not in lstm_shape: | ||||
|         lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector) | ||||
|     if 'nr_dim' not in lstm_shape: | ||||
|         lstm_shape['nr_dim'] = nlp.vocab.vectors_length | ||||
|     print("Make model") | ||||
|     model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings)) | ||||
|     print("Parsing texts...") | ||||
|     if by_sentence: | ||||
|         train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length']) | ||||
|         dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length']) | ||||
|     else: | ||||
|         train_data = DocDataset(nlp, train_texts, train_labels) | ||||
|         dev_data = DocDataset(nlp, dev_texts, dev_labels) | ||||
|     train_iter = SerialIterator(train_data, batch_size=batch_size, | ||||
|                                 shuffle=True, repeat=True) | ||||
|     dev_iter = SerialIterator(dev_data, batch_size=batch_size, | ||||
|                               shuffle=False, repeat=False) | ||||
|     optimizer = chainer.optimizers.Adam() | ||||
|     optimizer.setup(model) | ||||
|     updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0) | ||||
|     trainer = chainer.training.Trainer(updater, (1, 'epoch'), out='result') | ||||
| 
 | ||||
|     trainer.extend(extensions.Evaluator(dev_iter, model, device=0)) | ||||
|     trainer.extend(extensions.LogReport()) | ||||
|     trainer.extend(extensions.PrintReport([ | ||||
|         'epoch', 'main/accuracy', 'validation/main/accuracy'])) | ||||
|     trainer.extend(extensions.ProgressBar()) | ||||
|      | ||||
|     trainer.run() | ||||
| 
 | ||||
| 
 | ||||
| def evaluate(model_dir, texts, labels, max_length=100): | ||||
|     def create_pipeline(nlp): | ||||
|         ''' | ||||
|         This could be a lambda, but named functions are easier to read in Python. | ||||
|         ''' | ||||
|         return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp, | ||||
|                                                                max_length=max_length)] | ||||
|      | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline = create_pipeline(nlp) | ||||
| 
 | ||||
|     correct = 0 | ||||
|     i = 0  | ||||
|     for doc in nlp.pipe(texts, batch_size=1000, n_threads=4): | ||||
|         correct += bool(doc.sentiment >= 0.5) == bool(labels[i]) | ||||
|         i += 1 | ||||
|     return float(correct) / i | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     train_dir=("Location of training file or directory"), | ||||
|     dev_dir=("Location of development file or directory"), | ||||
|     model_dir=("Location of output model directory",), | ||||
|     is_runtime=("Demonstrate run-time usage", "flag", "r", bool), | ||||
|     nr_hidden=("Number of hidden units", "option", "H", int), | ||||
|     max_length=("Maximum sentence length", "option", "L", int), | ||||
|     dropout=("Dropout", "option", "d", float), | ||||
|     learn_rate=("Learn rate", "option", "e", float), | ||||
|     nb_epoch=("Number of training epochs", "option", "i", int), | ||||
|     batch_size=("Size of minibatches for training LSTM", "option", "b", int), | ||||
|     nr_examples=("Limit to N examples", "option", "n", int) | ||||
| ) | ||||
| def main(model_dir, train_dir, dev_dir, | ||||
|          is_runtime=False, | ||||
|          nr_hidden=64, max_length=100, # Shape | ||||
|          dropout=0.5, learn_rate=0.001, # General NN config | ||||
|          nb_epoch=5, batch_size=32, nr_examples=-1):  # Training params | ||||
|     model_dir = pathlib.Path(model_dir) | ||||
|     train_dir = pathlib.Path(train_dir) | ||||
|     dev_dir = pathlib.Path(dev_dir) | ||||
|     if is_runtime: | ||||
|         dev_texts, dev_labels = read_data(dev_dir) | ||||
|         acc = evaluate(model_dir, dev_texts, dev_labels, max_length=max_length) | ||||
|         print(acc) | ||||
|     else: | ||||
|         print("Read data") | ||||
|         train_texts, train_labels = read_data(train_dir, limit=nr_examples) | ||||
|         dev_texts, dev_labels = read_data(dev_dir, limit=nr_examples) | ||||
|         print("Using GPU 0") | ||||
|         #chainer.cuda.get_device(0).use() | ||||
|         train_labels = xp.asarray(train_labels, dtype='i') | ||||
|         dev_labels = xp.asarray(dev_labels, dtype='i') | ||||
|         lstm = train(train_texts, train_labels, dev_texts, dev_labels, | ||||
|                      {'nr_hidden': nr_hidden, 'max_length': max_length, 'nr_class': 2, | ||||
|                       'nr_vector': 5000}, | ||||
|                       {'dropout': 0.5, 'lr': learn_rate}, | ||||
|                       {}, | ||||
|                       nb_epoch=nb_epoch, batch_size=batch_size) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     #cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") | ||||
|     #s = pstats.Stats("Profile.prof") | ||||
|     #s.strip_dirs().sort_stats("time").print_stats() | ||||
|     plac.call(main) | ||||
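The most reusable idea in the deleted example is `get_features`: each document becomes a fixed-width row of integer IDs that index an embedding table, zero-padded and clipped to `max_length`. A minimal NumPy sketch of that idea (toy integer IDs stand in for `token.norm`; this is an illustration, not the original cupy code):

    import numpy as np

    def get_features(docs, max_length):
        # One row per document, max_length columns, zero-padded and clipped.
        Xs = np.zeros((len(docs), max_length), dtype='i')
        for i, token_ids in enumerate(docs):
            j = 0
            for token_id in token_ids:   # stands in for token.norm on a real Doc
                if token_id != 0:        # the original also skips punctuation and whitespace
                    Xs[i, j] = token_id
                    j += 1
                    if j >= max_length:
                        break
        return Xs

    print(get_features([[5, 0, 7, 9, 2]], max_length=3))   # [[5 7 9]]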
Phrase-matching example:
@@ -20,71 +20,71 @@ The algorithm is O(n) at run-time for a document of length n because we're only ever
| matching over the tag patterns. So no matter how many phrases we're looking for, | ||||
| our pattern set stays very small (exact size depends on the maximum length we're | ||||
| looking for, as the query language currently has no quantifiers) | ||||
| 
 | ||||
| The example expects a .bz2 file from the Reddit corpus, and a patterns file, | ||||
| formatted in jsonl as a sequence of entries like this: | ||||
| 
 | ||||
| {"text":"Anchorage"} | ||||
| {"text":"Angola"} | ||||
| {"text":"Ann Arbor"} | ||||
| {"text":"Annapolis"} | ||||
| {"text":"Appalachia"} | ||||
| {"text":"Argentina"} | ||||
| """ | ||||
| from __future__ import print_function, unicode_literals, division | ||||
| from ast import literal_eval | ||||
| from bz2 import BZ2File | ||||
| import time | ||||
| import math | ||||
| import codecs | ||||
| 
 | ||||
| import plac | ||||
| import ujson | ||||
| 
 | ||||
| from preshed.maps import PreshMap | ||||
| from preshed.counter import PreshCounter | ||||
| from spacy.strings import hash_string | ||||
| from spacy.en import English | ||||
| from spacy.matcher import PhraseMatcher | ||||
| import spacy | ||||
| 
 | ||||
| 
 | ||||
| def read_gazetteer(tokenizer, loc, n=-1): | ||||
|     for i, line in enumerate(open(loc)): | ||||
|         phrase = literal_eval('u' + line.strip()) | ||||
|         if ' (' in phrase and phrase.endswith(')'): | ||||
|             phrase = phrase.split(' (', 1)[0] | ||||
|         if i >= n: | ||||
|             break | ||||
|         phrase = tokenizer(phrase) | ||||
|         if all((t.is_lower and t.prob >= -10) for t in phrase): | ||||
|             continue | ||||
|         data = ujson.loads(line.strip()) | ||||
|         phrase = tokenizer(data['text']) | ||||
|         for w in phrase: | ||||
|             _ = tokenizer.vocab[w.text] | ||||
|         if len(phrase) >= 2: | ||||
|             yield phrase | ||||
| 
 | ||||
| 
 | ||||
| def read_text(bz2_loc): | ||||
| def read_text(bz2_loc, n=10000): | ||||
|     with BZ2File(bz2_loc) as file_: | ||||
|         for line in file_: | ||||
|             yield line.decode('utf8') | ||||
|         for i, line in enumerate(file_): | ||||
|             data = ujson.loads(line) | ||||
|             yield data['body'] | ||||
|             if i >= n: | ||||
|                 break | ||||
| 
 | ||||
| 
 | ||||
| def get_matches(tokenizer, phrases, texts, max_length=6): | ||||
|     matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) | ||||
|     print("Match") | ||||
|     matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) | ||||
|     matcher.add('Phrase', None, *phrases) | ||||
|     for text in texts: | ||||
|         doc = tokenizer(text) | ||||
|         for w in doc: | ||||
|             _ = doc.vocab[w.text] | ||||
|         matches = matcher(doc) | ||||
|         for mwe in doc.ents: | ||||
|             yield mwe | ||||
|         for ent_id, start, end in matches: | ||||
|             yield (ent_id, doc[start:end].text) | ||||
| 
 | ||||
| 
 | ||||
| def main(patterns_loc, text_loc, counts_loc, n=10000000): | ||||
|     nlp = English(parser=False, tagger=False, entity=False) | ||||
|     print("Make matcher") | ||||
|     phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) | ||||
|     counts = PreshCounter() | ||||
| def main(patterns_loc, text_loc, n=10000): | ||||
|     nlp = spacy.blank('en') | ||||
|     nlp.vocab.lex_attr_getters = {} | ||||
|     phrases = read_gazetteer(nlp.tokenizer, patterns_loc) | ||||
|     count = 0 | ||||
|     t1 = time.time() | ||||
|     for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): | ||||
|         counts.inc(hash_string(mwe.text), 1) | ||||
|     for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): | ||||
|         count += 1 | ||||
|     t2 = time.time() | ||||
|     print("10m tokens in %d s" % (t2 - t1)) | ||||
|      | ||||
|     with codecs.open(counts_loc, 'w', 'utf8') as file_: | ||||
|         for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n): | ||||
|             text = phrase.string | ||||
|             key = hash_string(text) | ||||
|             count = counts[key] | ||||
|             if count != 0: | ||||
|                 file_.write('%d\t%s\n' % (count, text)) | ||||
|     print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
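The rewritten example targets the spaCy 2.x `PhraseMatcher` API, where patterns are pre-tokenized `Doc` objects registered under a shared key, as in `matcher.add('Phrase', None, *phrases)` above. A hedged usage sketch with a blank English pipeline and made-up patterns echoing the jsonl snippet:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')
    matcher = PhraseMatcher(nlp.vocab)
    # Patterns are Doc objects; several can share one key.
    matcher.add('GPE', None, nlp('Ann Arbor'), nlp('Annapolis'))

    doc = nlp('She moved from Annapolis to Ann Arbor last year.')
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
    # Expected: GPE Annapolis / GPE Ann Arbor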
German NER training example (GermEval 2014):
@@ -13,24 +13,29 @@ Input data:
| https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip | ||||
| 
 | ||||
| Developed for: spaCy 1.7.1 | ||||
| Last tested for: spaCy 1.7.1 | ||||
| Last tested for: spaCy 2.0.0a13 | ||||
| ''' | ||||
| from __future__ import unicode_literals, print_function | ||||
| import plac | ||||
| from pathlib import Path | ||||
| import random | ||||
| import json | ||||
| import tqdm | ||||
| 
 | ||||
| from thinc.neural.optimizers import Adam | ||||
| from thinc.neural.ops import NumpyOps | ||||
| 
 | ||||
| import spacy.orth as orth_funcs | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.pipeline import BeamEntityRecognizer | ||||
| from spacy.pipeline import EntityRecognizer | ||||
| from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer | ||||
| from spacy.tokenizer import Tokenizer | ||||
| from spacy.tokens import Doc | ||||
| from spacy.attrs import * | ||||
| from spacy.gold import GoldParse | ||||
| from spacy.gold import _iob_to_biluo as iob_to_biluo | ||||
| from spacy.gold import iob_to_biluo | ||||
| from spacy.gold import minibatch | ||||
| from spacy.scorer import Scorer | ||||
| import spacy.util | ||||
| 
 | ||||
| 
 | ||||
| try: | ||||
|     unicode | ||||
|  | @ -38,96 +43,38 @@ except NameError: | |||
|     unicode = str | ||||
| 
 | ||||
| 
 | ||||
| spacy.util.set_env_log(True) | ||||
| 
 | ||||
| 
 | ||||
| def init_vocab(): | ||||
|     return Vocab( | ||||
|         lex_attr_getters={ | ||||
|             LOWER: lambda string: string.lower(), | ||||
|             SHAPE: orth_funcs.word_shape, | ||||
|             NORM: lambda string: string.lower(), | ||||
|             PREFIX: lambda string: string[0], | ||||
|             SUFFIX: lambda string: string[-3:], | ||||
|             CLUSTER: lambda string: 0, | ||||
|             IS_ALPHA: orth_funcs.is_alpha, | ||||
|             IS_ASCII: orth_funcs.is_ascii, | ||||
|             IS_DIGIT: lambda string: string.isdigit(), | ||||
|             IS_LOWER: orth_funcs.is_lower, | ||||
|             IS_PUNCT: orth_funcs.is_punct, | ||||
|             IS_SPACE: lambda string: string.isspace(), | ||||
|             IS_TITLE: orth_funcs.is_title, | ||||
|             IS_UPPER: orth_funcs.is_upper, | ||||
|             IS_STOP: lambda string: False, | ||||
|             IS_OOV: lambda string: True | ||||
|         }) | ||||
| 
 | ||||
| 
 | ||||
| def save_vocab(vocab, path): | ||||
|     path = Path(path) | ||||
|     if not path.exists(): | ||||
|         path.mkdir() | ||||
|     elif not path.is_dir(): | ||||
|         raise IOError("Can't save vocab to %s\nNot a directory" % path) | ||||
|     with (path / 'strings.json').open('w') as file_: | ||||
|         vocab.strings.dump(file_) | ||||
|     vocab.dump((path / 'lexemes.bin').as_posix()) | ||||
| 
 | ||||
| 
 | ||||
| def load_vocab(path): | ||||
|     path = Path(path) | ||||
|     if not path.exists(): | ||||
|         raise IOError("Cannot load vocab from %s\nDoes not exist" % path) | ||||
|     if not path.is_dir(): | ||||
|         raise IOError("Cannot load vocab from %s\nNot a directory" % path) | ||||
|     return Vocab.load(path) | ||||
| 
 | ||||
| 
 | ||||
| def init_ner_model(vocab, features=None): | ||||
|     if features is None: | ||||
|         features = tuple(EntityRecognizer.feature_templates) | ||||
|     return EntityRecognizer(vocab, features=features) | ||||
| 
 | ||||
| 
 | ||||
| def save_ner_model(model, path): | ||||
|     path = Path(path) | ||||
|     if not path.exists(): | ||||
|         path.mkdir() | ||||
|     if not path.is_dir(): | ||||
|         raise IOError("Can't save model to %s\nNot a directory" % path) | ||||
|     model.model.dump((path / 'model').as_posix()) | ||||
|     with (path / 'config.json').open('w') as file_: | ||||
|         data = json.dumps(model.cfg) | ||||
|         if not isinstance(data, unicode): | ||||
|             data = data.decode('utf8') | ||||
|         file_.write(data) | ||||
| 
 | ||||
| 
 | ||||
| def load_ner_model(vocab, path): | ||||
|     return EntityRecognizer.load(path, vocab) | ||||
| 
 | ||||
| 
 | ||||
| class Pipeline(object): | ||||
|     @classmethod | ||||
|     def load(cls, path): | ||||
|         path = Path(path) | ||||
|         if not path.exists(): | ||||
|             raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) | ||||
|         if not path.is_dir(): | ||||
|             raise IOError("Cannot load pipeline from %s\nNot a directory" % path) | ||||
|         vocab = load_vocab(path) | ||||
|         tokenizer = Tokenizer(vocab, {}, None, None, None) | ||||
|         ner_model = load_ner_model(vocab, path / 'ner') | ||||
|         return cls(vocab, tokenizer, ner_model) | ||||
| 
 | ||||
|     def __init__(self, vocab=None, tokenizer=None, entity=None): | ||||
|         if vocab is None: | ||||
|             vocab = init_vocab() | ||||
|         if tokenizer is None: | ||||
|             tokenizer = Tokenizer(vocab, {}, None, None, None) | ||||
|         if entity is None: | ||||
|             entity = init_ner_model(self.vocab) | ||||
|             entity = NeuralEntityRecognizer(vocab) | ||||
|         self.vocab = vocab | ||||
|         self.tokenizer = tokenizer | ||||
|         self.entity = entity | ||||
|         self.pipeline = [self.entity] | ||||
| 
 | ||||
|     def begin_training(self): | ||||
|         for model in self.pipeline: | ||||
|             model.begin_training([]) | ||||
|         optimizer = Adam(NumpyOps(), 0.001) | ||||
|         return optimizer | ||||
| 
 | ||||
|     def __call__(self, input_): | ||||
|         doc = self.make_doc(input_) | ||||
|         for process in self.pipeline: | ||||
|  | @ -147,14 +94,16 @@ class Pipeline(object): | |||
|         gold = GoldParse(doc, entities=annotations) | ||||
|         return gold | ||||
| 
 | ||||
|     def update(self, input_, annot): | ||||
|         doc = self.make_doc(input_) | ||||
|         gold = self.make_gold(input_, annot) | ||||
|         for ner in gold.ner: | ||||
|             if ner not in (None, '-', 'O'): | ||||
|                 action, label = ner.split('-', 1) | ||||
|                 self.entity.add_label(label) | ||||
|         return self.entity.update(doc, gold) | ||||
|     def update(self, inputs, annots, sgd, losses=None, drop=0.): | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         docs = [self.make_doc(input_) for input_ in inputs] | ||||
|         golds = [self.make_gold(input_, annot) for input_, annot in | ||||
|                  zip(inputs, annots)] | ||||
| 
 | ||||
|         self.entity.update(docs, golds, drop=drop, | ||||
|                            sgd=sgd, losses=losses) | ||||
|         return losses | ||||
| 
 | ||||
|     def evaluate(self, examples): | ||||
|         scorer = Scorer() | ||||
|  | @ -164,34 +113,36 @@ class Pipeline(object): | |||
|             scorer.score(doc, gold) | ||||
|         return scorer.scores | ||||
| 
 | ||||
|     def average_weights(self): | ||||
|         self.entity.model.end_training() | ||||
| 
 | ||||
|     def save(self, path): | ||||
|     def to_disk(self, path): | ||||
|         path = Path(path) | ||||
|         if not path.exists(): | ||||
|             path.mkdir() | ||||
|         elif not path.is_dir(): | ||||
|             raise IOError("Can't save pipeline to %s\nNot a directory" % path) | ||||
|         save_vocab(self.vocab, path / 'vocab') | ||||
|         save_ner_model(self.entity, path / 'ner') | ||||
|         self.vocab.to_disk(path / 'vocab') | ||||
|         self.entity.to_disk(path / 'ner') | ||||
| 
 | ||||
|     def from_disk(self, path): | ||||
|         path = Path(path) | ||||
|         if not path.exists(): | ||||
|             raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) | ||||
|         if not path.is_dir(): | ||||
|             raise IOError("Cannot load pipeline from %s\nNot a directory" % path) | ||||
|         self.vocab = self.vocab.from_disk(path / 'vocab') | ||||
|         self.entity = self.entity.from_disk(path / 'ner') | ||||
| 
 | ||||
| 
 | ||||
| def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): | ||||
|     next_epoch = train_examples | ||||
| def train(nlp, train_examples, dev_examples, nr_epoch=5): | ||||
|     sgd = nlp.begin_training() | ||||
|     print("Iter", "Loss", "P", "R", "F") | ||||
|     for i in range(nr_epoch): | ||||
|         this_epoch = next_epoch | ||||
|         next_epoch = [] | ||||
|         loss = 0 | ||||
|         for input_, annot in this_epoch: | ||||
|             loss += nlp.update(input_, annot) | ||||
|             if (i+1) < nr_epoch: | ||||
|                 next_epoch.append((input_, annot)) | ||||
|         random.shuffle(next_epoch) | ||||
|         random.shuffle(train_examples) | ||||
|         losses = {} | ||||
|         for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8): | ||||
|             inputs, annots = zip(*batch) | ||||
|             nlp.update(list(inputs), list(annots), sgd, losses=losses) | ||||
|         scores = nlp.evaluate(dev_examples) | ||||
|         report_scores(i, loss, scores) | ||||
|     nlp.average_weights() | ||||
|         report_scores(i, losses['ner'], scores) | ||||
|     scores = nlp.evaluate(dev_examples) | ||||
|     report_scores(i+1, losses['ner'], scores) | ||||
| 
 | ||||
|  | @ -208,7 +159,8 @@ def read_examples(path): | |||
|     with path.open() as file_: | ||||
|         sents = file_.read().strip().split('\n\n') | ||||
|         for sent in sents: | ||||
|             if not sent.strip(): | ||||
|             sent = sent.strip() | ||||
|             if not sent: | ||||
|                 continue | ||||
|             tokens = sent.split('\n') | ||||
|             while tokens and tokens[0].startswith('#'): | ||||
|  | @ -217,28 +169,39 @@ def read_examples(path): | |||
|             iob = [] | ||||
|             for token in tokens: | ||||
|                 if token.strip(): | ||||
|                     pieces = token.split() | ||||
|                     pieces = token.split('\t') | ||||
|                     words.append(pieces[1]) | ||||
|                     iob.append(pieces[2]) | ||||
|             yield words, iob_to_biluo(iob) | ||||
| 
 | ||||
| 
 | ||||
| def get_labels(examples): | ||||
|     labels = set() | ||||
|     for words, tags in examples: | ||||
|         for tag in tags: | ||||
|             if '-' in tag: | ||||
|                 labels.add(tag.split('-')[1]) | ||||
|     return sorted(labels) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model_dir=("Path to save the model", "positional", None, Path), | ||||
|     train_loc=("Path to your training data", "positional", None, Path), | ||||
|     dev_loc=("Path to your development data", "positional", None, Path), | ||||
| ) | ||||
| def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), | ||||
|         train_loc=None, dev_loc=None, nr_epoch=30): | ||||
|      | ||||
|     train_examples = read_examples(train_loc) | ||||
| def main(model_dir, train_loc, dev_loc, nr_epoch=30): | ||||
|     print(model_dir, train_loc, dev_loc) | ||||
|     train_examples = list(read_examples(train_loc)) | ||||
|     dev_examples = read_examples(dev_loc) | ||||
|     nlp = Pipeline.load(model_dir) | ||||
|     nlp = Pipeline() | ||||
|     for label in get_labels(train_examples): | ||||
|         nlp.entity.add_label(label) | ||||
|         print("Add label", label) | ||||
| 
 | ||||
|     train(nlp, train_examples, list(dev_examples), ctx, nr_epoch) | ||||
|     train(nlp, train_examples, list(dev_examples), nr_epoch) | ||||
| 
 | ||||
|     nlp.save(model_dir) | ||||
|     nlp.to_disk(model_dir) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
|     plac.call(main) | ||||
|  |  | |||
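Before building `GoldParse` objects, the GermEval script converts IOB tags with `iob_to_biluo`, now imported directly from `spacy.gold`. A small sketch of what that conversion produces (the tags here are invented):

    from spacy.gold import iob_to_biluo

    iob = ['O', 'B-LOC', 'I-LOC', 'O', 'B-PER']
    print(iob_to_biluo(iob))
    # ['O', 'B-LOC', 'L-LOC', 'O', 'U-PER'] -- multi-token spans get B/I/L, single tokens get U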
New-entity-type training example:
@@ -25,7 +25,7 @@ For more details, see the documentation:
| * Saving and loading models: https://spacy.io/docs/usage/saving-loading | ||||
| 
 | ||||
| Developed for: spaCy 1.7.6 | ||||
| Last tested for: spaCy 1.7.6 | ||||
| Last updated for: spaCy 2.0.0a13 | ||||
| """ | ||||
| from __future__ import unicode_literals, print_function | ||||
| 
 | ||||
|  | @ -34,55 +34,41 @@ from pathlib import Path | |||
| import random | ||||
| 
 | ||||
| import spacy | ||||
| from spacy.gold import GoldParse | ||||
| from spacy.tagger import Tagger | ||||
| from spacy.gold import GoldParse, minibatch | ||||
| from spacy.pipeline import NeuralEntityRecognizer | ||||
| from spacy.pipeline import TokenVectorEncoder | ||||
| 
 | ||||
| 
 | ||||
| def get_gold_parses(tokenizer, train_data): | ||||
|     '''Shuffle and create GoldParse objects''' | ||||
|     random.shuffle(train_data) | ||||
|     for raw_text, entity_offsets in train_data: | ||||
|         doc = tokenizer(raw_text) | ||||
|         gold = GoldParse(doc, entities=entity_offsets) | ||||
|         yield doc, gold | ||||
| 
 | ||||
|   | ||||
| def train_ner(nlp, train_data, output_dir): | ||||
|     # Add new words to vocab | ||||
|     for raw_text, _ in train_data: | ||||
|         doc = nlp.make_doc(raw_text) | ||||
|         for word in doc: | ||||
|             _ = nlp.vocab[word.orth] | ||||
|     random.seed(0) | ||||
|     # You may need to change the learning rate. It's generally difficult to | ||||
|     # guess what rate you should set, especially when you have limited data. | ||||
|     nlp.entity.model.learn_rate = 0.001 | ||||
|     for itn in range(1000): | ||||
|         random.shuffle(train_data) | ||||
|         loss = 0. | ||||
|         for raw_text, entity_offsets in train_data: | ||||
|             gold = GoldParse(doc, entities=entity_offsets) | ||||
|             # By default, the GoldParse class assumes that the entities | ||||
|             # described by offset are complete, and all other words should | ||||
|             # have the tag 'O'. You can tell it to make no assumptions | ||||
|             # about the tag of a word by giving it the tag '-'. | ||||
|             # However, this allows a trivial solution to the current | ||||
|             # learning problem: if words are either 'any tag' or 'ANIMAL', | ||||
|             # the model can learn that all words can be tagged 'ANIMAL'. | ||||
|             #for i in range(len(gold.ner)): | ||||
|                 #if not gold.ner[i].endswith('ANIMAL'): | ||||
|                 #    gold.ner[i] = '-' | ||||
|             doc = nlp.make_doc(raw_text) | ||||
|             nlp.tagger(doc) | ||||
|             # As of 1.9, spaCy's parser now lets you supply a dropout probability | ||||
|             # This might help the model generalize better from only a few | ||||
|             # examples. | ||||
|             loss += nlp.entity.update(doc, gold, drop=0.9) | ||||
|         if loss == 0: | ||||
|             break | ||||
|     # This step averages the model's weights. This may or may not be good for | ||||
|     # your situation --- it's empirical. | ||||
|     nlp.end_training() | ||||
|     if output_dir: | ||||
|         if not output_dir.exists(): | ||||
|     optimizer = nlp.begin_training(lambda: []) | ||||
|     nlp.meta['name'] = 'en_ent_animal' | ||||
|     for itn in range(50): | ||||
|         losses = {} | ||||
|         for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3): | ||||
|             docs, golds = zip(*batch) | ||||
|             nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True, | ||||
|                        drop=0.35) | ||||
|         print(losses) | ||||
|     if not output_dir: | ||||
|         return | ||||
|     elif not output_dir.exists(): | ||||
|         output_dir.mkdir() | ||||
|         nlp.save_to_directory(output_dir) | ||||
|     nlp.to_disk(output_dir) | ||||
| 
 | ||||
| 
 | ||||
| def main(model_name, output_directory=None): | ||||
|     print("Loading initial model", model_name) | ||||
|     nlp = spacy.load(model_name) | ||||
|     print("Creating initial model", model_name) | ||||
|     nlp = spacy.blank(model_name) | ||||
|     if output_directory is not None: | ||||
|         output_directory = Path(output_directory) | ||||
| 
 | ||||
|  | @ -91,6 +77,11 @@ def main(model_name, output_directory=None): | |||
|             "Horses are too tall and they pretend to care about your feelings", | ||||
|             [(0, 6, 'ANIMAL')], | ||||
|         ), | ||||
|         ( | ||||
|             "Do they bite?",  | ||||
|             [], | ||||
|         ), | ||||
|   | ||||
|         ( | ||||
|             "horses are too tall and they pretend to care about your feelings", | ||||
|             [(0, 6, 'ANIMAL')] | ||||
|  | @ -109,18 +100,20 @@ def main(model_name, output_directory=None): | |||
|         ) | ||||
| 
 | ||||
|     ] | ||||
|     nlp.entity.add_label('ANIMAL') | ||||
|     nlp.pipeline.append(TokenVectorEncoder(nlp.vocab)) | ||||
|     nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab)) | ||||
|     nlp.pipeline[-1].add_label('ANIMAL') | ||||
|     train_ner(nlp, train_data, output_directory) | ||||
| 
 | ||||
|     # Test that the entity is recognized | ||||
|     doc = nlp('Do you like horses?') | ||||
|     text = 'Do you like horses?' | ||||
|     print("Ents in 'Do you like horses?':") | ||||
|     doc = nlp(text) | ||||
|     for ent in doc.ents: | ||||
|         print(ent.label_, ent.text) | ||||
|     if output_directory: | ||||
|         print("Loading from", output_directory) | ||||
|         nlp2 = spacy.load('en', path=output_directory) | ||||
|         nlp2.entity.add_label('ANIMAL') | ||||
|         nlp2 = spacy.load(output_directory) | ||||
|         doc2 = nlp2('Do you like horses?') | ||||
|         for ent in doc2.ents: | ||||
|             print(ent.label_, ent.text) | ||||
|  |  | |||
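The removed comments above describe how `GoldParse` interprets character offsets: tokens outside the listed spans default to 'O', unless they are explicitly tagged '-' to mean "no assumption". A hedged sketch of that offsets-to-BILUO step, reusing the example's 'ANIMAL' label:

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    doc = nlp.make_doc('Do you like horses?')
    gold = GoldParse(doc, entities=[(12, 18, 'ANIMAL')])   # character offsets into the text
    print(gold.ner)
    # Expected: ['O', 'O', 'O', 'U-ANIMAL', 'O']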
Text-classification training example:
@@ -1,3 +1,7 @@
| '''Train a multi-label convolutional neural network text classifier, | ||||
| using the spacy.pipeline.TextCategorizer component. The model is then added | ||||
| to spacy.pipeline, and predictions are available at `doc.cats`. | ||||
| ''' | ||||
| from __future__ import unicode_literals | ||||
| import plac | ||||
| import random | ||||
|  | @ -12,6 +16,11 @@ from spacy.gold import GoldParse, minibatch | |||
| from spacy.util import compounding | ||||
| from spacy.pipeline import TextCategorizer | ||||
| 
 | ||||
| # TODO: Remove this once we're not supporting models trained with thinc <6.9.0 | ||||
| import thinc.neural._classes.layernorm | ||||
| thinc.neural._classes.layernorm.set_compat_six_eight(False) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def train_textcat(tokenizer, textcat, | ||||
|                   train_texts, train_cats, dev_texts, dev_cats, | ||||
|  | @ -24,14 +33,15 @@ def train_textcat(tokenizer, textcat, | |||
|     train_docs = [tokenizer(text) for text in train_texts] | ||||
|     train_gold = [GoldParse(doc, cats=cats) for doc, cats in | ||||
|                   zip(train_docs, train_cats)] | ||||
|     train_data = zip(train_docs, train_gold) | ||||
|     train_data = list(zip(train_docs, train_gold)) | ||||
|     batch_sizes = compounding(4., 128., 1.001) | ||||
|     for i in range(n_iter): | ||||
|         losses = {} | ||||
|         train_data = tqdm.tqdm(train_data, leave=False) # Progress bar | ||||
|         for batch in minibatch(train_data, size=batch_sizes): | ||||
|         # Progress bar and minibatching | ||||
|         batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes) | ||||
|         for batch in batches: | ||||
|             docs, golds = zip(*batch) | ||||
|             textcat.update((docs, None), golds, sgd=optimizer, drop=0.2, | ||||
|             textcat.update(docs, golds, sgd=optimizer, drop=0.2, | ||||
|                 losses=losses) | ||||
|         with textcat.model.use_params(optimizer.averages): | ||||
|             scores = evaluate(tokenizer, textcat, dev_texts, dev_cats) | ||||
|  | @ -61,12 +71,13 @@ def evaluate(tokenizer, textcat, texts, cats): | |||
|     return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}   | ||||
| 
 | ||||
| 
 | ||||
| def load_data(): | ||||
| def load_data(limit=0): | ||||
|     # Partition off part of the train data --- avoid running experiments | ||||
|     # against test. | ||||
|     train_data, _ = thinc.extra.datasets.imdb() | ||||
| 
 | ||||
|     random.shuffle(train_data) | ||||
|     train_data = train_data[-limit:] | ||||
| 
 | ||||
|     texts, labels = zip(*train_data) | ||||
|     cats = [(['POSITIVE'] if y else []) for y in labels] | ||||
|  | @ -86,7 +97,7 @@ def main(model_loc=None): | |||
|     textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE']) | ||||
| 
 | ||||
|     print("Load IMDB data") | ||||
|     (train_texts, train_cats), (dev_texts, dev_cats) = load_data() | ||||
|     (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=1000) | ||||
| 
 | ||||
|     print("Itn.\tLoss\tP\tR\tF") | ||||
|     progress = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}' | ||||
|  |  | |||
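The loop above draws its batch sizes from `compounding(4., 128., 1.001)`, so batches start small and grow geometrically toward 128. A short sketch of how the two helpers interact in these 2.0 alphas, where `minibatch` still lives in `spacy.gold`:

    from spacy.gold import minibatch
    from spacy.util import compounding

    batch_sizes = compounding(4., 128., 1.001)   # yields 4.0, 4.0*1.001, ... capped at 128
    data = list(range(10))
    for batch in minibatch(data, size=batch_sizes):
        batch = list(batch)
        print(len(batch), batch)   # early batches hold about 4 items; sizes creep upward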
							
								
								
									
examples/vectors_fast_text.py (new file):
@@ -0,0 +1,30 @@
| '''Load vectors for a language trained using FastText | ||||
| 
 | ||||
| https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md | ||||
| ''' | ||||
| from __future__ import unicode_literals | ||||
| import plac | ||||
| import numpy | ||||
| 
 | ||||
| import spacy.language | ||||
| 
 | ||||
| 
 | ||||
| def main(vectors_loc): | ||||
|     nlp = spacy.language.Language() | ||||
| 
 | ||||
|     with open(vectors_loc, 'rb') as file_: | ||||
|         header = file_.readline() | ||||
|         nr_row, nr_dim = header.split() | ||||
|         nlp.vocab.clear_vectors(int(nr_dim)) | ||||
|         for line in file_: | ||||
|             line = line.decode('utf8') | ||||
|             pieces = line.split()  | ||||
|             word = pieces[0] | ||||
|             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') | ||||
|             nlp.vocab.set_vector(word, vector) | ||||
|     doc = nlp(u'class colspan') | ||||
|     print(doc[0].similarity(doc[1])) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     plac.call(main) | ||||
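The loader assumes the plain-text `.vec` format that FastText publishes: a header line giving the row count and dimensionality, then one whitespace-separated line per word. A tiny self-contained sketch of that parsing step, with made-up numbers and no spaCy involved:

    lines = [
        '2 3',                       # header: <rows> <dims>
        'class 0.1 -0.2 0.3',
        'colspan 0.05 -0.1 0.4',
    ]
    nr_row, nr_dim = (int(x) for x in lines[0].split())
    for line in lines[1:]:
        pieces = line.split()
        word, vector = pieces[0], [float(v) for v in pieces[1:]]
        assert len(vector) == nr_dim
        print(word, vector)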
							
								
								
									
fabfile.py (vendored):
@@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
| def env(lang='python2.7'): | ||||
|     if path.exists(VENV_DIR): | ||||
|         local('rm -rf {env}'.format(env=VENV_DIR)) | ||||
|     local('pip install virtualenv') | ||||
|     local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -32,6 +33,10 @@ def make(): | |||
|             local('pip install -r requirements.txt') | ||||
|             local('python setup.py build_ext --inplace') | ||||
| 
 | ||||
| def sdist(): | ||||
|     with virtualenv(VENV_DIR): | ||||
|         with lcd(path.dirname(__file__)): | ||||
|             local('python setup.py sdist') | ||||
| 
 | ||||
| def clean(): | ||||
|     with lcd(path.dirname(__file__)): | ||||
|  |  | |||
requirements.txt:
@@ -1,9 +1,9 @@
| cython<0.24 | ||||
| cython>=0.24,<0.27.0 | ||||
| pathlib | ||||
| numpy>=1.7 | ||||
| cymem>=1.30,<1.32 | ||||
| preshed>=1.0.0,<2.0.0 | ||||
| thinc>=6.8.0,<6.9.0 | ||||
| thinc>=6.9.0,<6.10.0 | ||||
| murmurhash>=0.28,<0.29 | ||||
| plac<1.0.0,>=0.9.6 | ||||
| six | ||||
|  | @ -13,7 +13,7 @@ requests>=2.13.0,<3.0.0 | |||
| regex==2017.4.5 | ||||
| ftfy>=4.4.2,<5.0.0 | ||||
| pytest>=3.0.6,<4.0.0 | ||||
| pip>=9.0.0,<10.0.0 | ||||
| mock>=2.0.0,<3.0.0 | ||||
| msgpack-python | ||||
| msgpack-numpy | ||||
| html5lib==1.0b8 | ||||
|  |  | |||
							
								
								
									
setup.py:
@@ -195,9 +195,8 @@ def setup_package():
|                 'murmurhash>=0.28,<0.29', | ||||
|                 'cymem>=1.30,<1.32', | ||||
|                 'preshed>=1.0.0,<2.0.0', | ||||
|                 'thinc>=6.8.0,<6.9.0', | ||||
|                 'thinc>=6.9.0,<6.10.0', | ||||
|                 'plac<1.0.0,>=0.9.6', | ||||
|                 'pip>=9.0.0,<10.0.0', | ||||
|                 'six', | ||||
|                 'pathlib', | ||||
|                 'ujson>=1.35', | ||||
|  |  | |||
spacy/__init__.py:
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
| from .cli.info import info as cli_info | ||||
| from .glossary import explain | ||||
| from .deprecated import resolve_load_name | ||||
| #from .about import __version__ | ||||
| from .about import __version__ | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
| def load(name, **overrides): | ||||
|     from .deprecated import resolve_load_name | ||||
|     name = resolve_load_name(name, **overrides) | ||||
|     return util.load_model(name, **overrides) | ||||
| 
 | ||||
|  |  | |||
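Re-exporting `__version__` from `about` makes the package version available at runtime again, alongside the name resolution that `load()` performs. A hedged sketch (assumes an 'en' model is installed and linked):

    import spacy

    print(spacy.__version__)   # now re-exported from spacy.about
    nlp = spacy.load('en')     # resolve_load_name() maps legacy names, then util.load_model() loads the package
    doc = nlp(u'This is a sentence.')
    print([t.text for t in doc])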
spacy/__main__.py:
@@ -7,7 +7,7 @@ if __name__ == '__main__':
|     import plac | ||||
|     import sys | ||||
|     from spacy.cli import download, link, info, package, train, convert, model | ||||
|     from spacy.cli import profile | ||||
|     from spacy.cli import profile, evaluate | ||||
|     from spacy.util import prints | ||||
| 
 | ||||
|     commands = { | ||||
|  | @ -15,6 +15,7 @@ if __name__ == '__main__': | |||
|         'link': link, | ||||
|         'info': info, | ||||
|         'train': train, | ||||
|         'evaluate': evaluate, | ||||
|         'convert': convert, | ||||
|         'package': package, | ||||
|         'model': model, | ||||
|  |  | |||
							
								
								
									
spacy/_ml.py:
@@ -1,28 +1,27 @@
| import ujson | ||||
| from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||
| from thinc.i2v import HashEmbed, StaticVectors | ||||
| from thinc.t2t import ExtractWindow, ParametricAttention | ||||
| from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||
| from thinc.misc import Residual | ||||
| from thinc.misc import BatchNorm as BN | ||||
| from thinc.misc import LayerNorm as LN | ||||
| 
 | ||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||
| from thinc.neural import Model, Maxout, Softmax, Affine | ||||
| from thinc.neural._classes.hash_embed import HashEmbed | ||||
| from thinc.api import FeatureExtracter, with_getitem | ||||
| from thinc.api import uniqued, wrap, flatten_add_lengths, noop | ||||
| 
 | ||||
| from thinc.linear.linear import LinearModel | ||||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| import random | ||||
| import cytoolz | ||||
| 
 | ||||
| from thinc.neural._classes.convolution import ExtractWindow | ||||
| from thinc.neural._classes.static_vectors import StaticVectors | ||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | ||||
| from thinc.neural._classes.layernorm import LayerNorm as LN | ||||
| from thinc.neural._classes.resnet import Residual | ||||
| from thinc.neural import ReLu | ||||
| from thinc.neural._classes.selu import SELU | ||||
| from thinc import describe | ||||
| from thinc.describe import Dimension, Synapses, Biases, Gradient | ||||
| from thinc.neural._classes.affine import _set_dimensions_if_needed | ||||
| from thinc.api import FeatureExtracter, with_getitem | ||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool | ||||
| from thinc.neural._classes.attention import ParametricAttention | ||||
| from thinc.linear.linear import LinearModel | ||||
| from thinc.api import uniqued, wrap, flatten_add_lengths | ||||
| 
 | ||||
| import thinc.extra.load_nlp | ||||
| 
 | ||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER | ||||
| from .tokens.doc import Doc | ||||
|  | @ -31,6 +30,11 @@ from . import util | |||
| import numpy | ||||
| import io | ||||
| 
 | ||||
| # TODO: Unset this once we no longer want to support previous models. | ||||
| import thinc.neural._classes.layernorm | ||||
| thinc.neural._classes.layernorm.set_compat_six_eight(True) | ||||
| 
 | ||||
| VECTORS_KEY = 'spacy_pretrained_vectors' | ||||
| 
 | ||||
| @layerize | ||||
| def _flatten_add_lengths(seqs, pad=0, drop=0.): | ||||
|  | @ -225,33 +229,80 @@ def drop_layer(layer, factor=2.): | |||
|     model.predict = layer | ||||
|     return model | ||||
| 
 | ||||
| def link_vectors_to_models(vocab): | ||||
|     vectors = vocab.vectors | ||||
|     ops = Model.ops | ||||
|     for word in vocab: | ||||
|         if word.orth in vectors.key2row: | ||||
|             word.rank = vectors.key2row[word.orth] | ||||
|         else: | ||||
|             word.rank = 0 | ||||
|     data = ops.asarray(vectors.data) | ||||
|     # Set an entry here, so that vectors are accessed by StaticVectors | ||||
|     # (unideal, I know) | ||||
|     thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data | ||||
| 
 | ||||
| def Tok2Vec(width, embed_size, preprocess=None): | ||||
| def Tok2Vec(width, embed_size, **kwargs): | ||||
|     pretrained_dims = kwargs.get('pretrained_dims', 0) | ||||
|     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): | ||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, | ||||
|                                  '*': reapply}): | ||||
|         norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm') | ||||
|         prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix') | ||||
|         suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix') | ||||
|         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape') | ||||
|         if pretrained_dims is not None and pretrained_dims >= 1: | ||||
|             glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID)) | ||||
| 
 | ||||
|             embed = uniqued( | ||||
|                 (glove | norm | prefix | suffix | shape) | ||||
|                 >> LN(Maxout(width, width*5, pieces=3)), column=5) | ||||
|         else: | ||||
|             embed = uniqued( | ||||
|                 (norm | prefix | suffix | shape) | ||||
|                 >> LN(Maxout(width, width*4, pieces=3)), column=5) | ||||
| 
 | ||||
| 
 | ||||
|         convolution = Residual( | ||||
|             ExtractWindow(nW=1) | ||||
|             >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces)) | ||||
|         ) | ||||
| 
 | ||||
|         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) | ||||
|         tok2vec = ( | ||||
|             with_flatten( | ||||
|                 asarray(Model.ops, dtype='uint64') | ||||
|                 >> uniqued(embed, column=5) | ||||
|                 >> Residual( | ||||
|                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) | ||||
|                 ) ** 4, pad=4 | ||||
|             FeatureExtracter(cols) | ||||
|             >> with_flatten( | ||||
|                 embed >> (convolution ** 4), pad=4) | ||||
|         ) | ||||
|         ) | ||||
|         if preprocess not in (False, None): | ||||
|             tok2vec = preprocess >> tok2vec | ||||
| 
 | ||||
|         # Work around thinc API limitations :(. TODO: Revise in Thinc 7 | ||||
|         tok2vec.nO = width | ||||
|         tok2vec.embed = embed | ||||
|     return tok2vec | ||||
| 
 | ||||
| 
 | ||||
| def reapply(layer, n_times): | ||||
|     def reapply_fwd(X, drop=0.): | ||||
|         backprops = [] | ||||
|         for i in range(n_times): | ||||
|             Y, backprop = layer.begin_update(X, drop=drop) | ||||
|             X = Y | ||||
|             backprops.append(backprop) | ||||
|         def reapply_bwd(dY, sgd=None): | ||||
|             dX = None | ||||
|             for backprop in reversed(backprops): | ||||
|                 dY = backprop(dY, sgd=sgd) | ||||
|                 if dX is None: | ||||
|                     dX = dY | ||||
|                 else: | ||||
|                     dX += dY | ||||
|             return dX | ||||
|         return Y, reapply_bwd | ||||
|     return wrap(reapply_fwd, layer) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def asarray(ops, dtype): | ||||
|     def forward(X, drop=0.): | ||||
|         return ops.asarray(X, dtype=dtype), None | ||||
|  | @ -455,20 +506,25 @@ def getitem(i): | |||
|         return X[i], None | ||||
|     return layerize(getitem_fwd) | ||||
| 
 | ||||
| def build_tagger_model(nr_class, token_vector_width, **cfg): | ||||
|     embed_size = util.env_opt('embed_size', 7500) | ||||
| def build_tagger_model(nr_class, **cfg): | ||||
|     embed_size = util.env_opt('embed_size', 7000) | ||||
|     if 'token_vector_width' in cfg: | ||||
|         token_vector_width = cfg['token_vector_width'] | ||||
|     else: | ||||
|         token_vector_width = util.env_opt('token_vector_width', 128) | ||||
|     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||
|     with Model.define_operators({'>>': chain, '+': add}): | ||||
|         # Input: (doc, tensor) tuples | ||||
|         private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) | ||||
| 
 | ||||
|         if 'tok2vec' in cfg: | ||||
|             tok2vec = cfg['tok2vec'] | ||||
|         else: | ||||
|             tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||
|                               pretrained_dims=pretrained_dims) | ||||
|         model = ( | ||||
|             fine_tune(private_tok2vec) | ||||
|             >> with_flatten( | ||||
|                 Maxout(token_vector_width, token_vector_width) | ||||
|                 >> Softmax(nr_class, token_vector_width) | ||||
|             ) | ||||
|             tok2vec | ||||
|             >> with_flatten(Softmax(nr_class, token_vector_width)) | ||||
|         ) | ||||
|     model.nI = None | ||||
|     model.tok2vec = tok2vec | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
|  | @ -514,6 +570,7 @@ def foreach(layer, drop_factor=1.0): | |||
| 
 | ||||
| def build_text_classifier(nr_class, width=64, **cfg): | ||||
|     nr_vector = cfg.get('nr_vector', 5000) | ||||
|     pretrained_dims = cfg.get('pretrained_dims', 0) | ||||
|     with Model.define_operators({'>>': chain, '+': add, '|': concatenate, | ||||
|                                  '**': clone}): | ||||
|         if cfg.get('low_data'): | ||||
|  | @ -521,7 +578,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|                 SpacyVectors | ||||
|                 >> flatten_add_lengths | ||||
|                 >> with_getitem(0, | ||||
|                     Affine(width, 300) | ||||
|                     Affine(width, pretrained_dims) | ||||
|                 ) | ||||
|                 >> ParametricAttention(width) | ||||
|                 >> Pooling(sum_pool) | ||||
|  | @ -548,18 +605,24 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|         if pretrained_dims: | ||||
|             static_vectors = ( | ||||
|                 SpacyVectors | ||||
|             >> with_flatten(Affine(width, 300)) | ||||
|                 >> with_flatten(Affine(width, pretrained_dims)) | ||||
|             ) | ||||
| 
 | ||||
|         cnn_model = ( | ||||
|             # TODO Make concatenate support lists | ||||
|             concatenate_lists(trained_vectors, static_vectors) | ||||
|             vectors = concatenate_lists(trained_vectors, static_vectors) | ||||
|             vectors_width = width*2 | ||||
|         else: | ||||
|             vectors = trained_vectors | ||||
|             vectors_width = width | ||||
|             static_vectors = None | ||||
|         cnn_model = ( | ||||
|             vectors | ||||
|             >> with_flatten( | ||||
|                 LN(Maxout(width, width*2)) | ||||
|                 LN(Maxout(width, vectors_width)) | ||||
|                 >> Residual( | ||||
|                     (ExtractWindow(nW=1) >> zero_init(Maxout(width, width*3))) | ||||
|                     (ExtractWindow(nW=1) >> LN(Maxout(width, width*3))) | ||||
|                 ) ** 2, pad=2 | ||||
|             ) | ||||
|             >> flatten_add_lengths | ||||
|  | @ -579,7 +642,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) | ||||
|             >> logistic | ||||
|         ) | ||||
| 
 | ||||
|     model.nO = nr_class | ||||
|     model.lsuv = False | ||||
|     return model | ||||
| 
 | ||||
|  |  | |||
|  | @ -3,14 +3,15 @@ | |||
| # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | ||||
| 
 | ||||
| __title__ = 'spacy-nightly' | ||||
| __version__ = '2.0.0a13' | ||||
| __version__ = '2.0.0a16' | ||||
| __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | ||||
| __uri__ = 'https://spacy.io' | ||||
| __author__ = 'Explosion AI' | ||||
| __email__ = 'contact@explosion.ai' | ||||
| __license__ = 'MIT' | ||||
| __release__ = True | ||||
| 
 | ||||
| __docs_models__ = 'https://spacy.io/docs/usage/models' | ||||
| __docs_models__ = 'https://alpha.spacy.io/usage/models' | ||||
| __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | ||||
| __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | ||||
| __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| # Reserve 64 values for flag features | ||||
| cpdef enum attr_id_t: | ||||
| cdef enum attr_id_t: | ||||
|     NULL_ATTR | ||||
|     IS_ALPHA | ||||
|     IS_ASCII | ||||
|  |  | |||
|  | @ -94,6 +94,7 @@ IDS = { | |||
| 
 | ||||
| # ATTR IDs, in order of the symbol | ||||
| NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] | ||||
| locals().update(IDS) | ||||
| 
 | ||||
| 
 | ||||
| def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||
|  |  | |||
|  | @ -4,5 +4,6 @@ from .link import link | |||
| from .package import package | ||||
| from .profile import profile | ||||
| from .train import train | ||||
| from .evaluate import evaluate | ||||
| from .convert import convert | ||||
| from .model import model | ||||
|  |  | |||
|  | @ -14,7 +14,7 @@ from ..util import prints | |||
| CONVERTERS = { | ||||
|     '.conllu': conllu2json, | ||||
|     '.conll': conllu2json, | ||||
|     '.iob': iob2json | ||||
|     '.iob': iob2json, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| from cytoolz import partition_all, concat | ||||
| 
 | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
|  | @ -10,11 +11,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|     """ | ||||
|     Convert IOB files into JSON format for use with train cli. | ||||
|     """ | ||||
|     # TODO: This isn't complete yet -- need to map from IOB to | ||||
|     # BILUO | ||||
|     with input_path.open('r', encoding='utf8') as file_: | ||||
|         docs = read_iob(file_) | ||||
| 
 | ||||
|         sentences = read_iob(file_) | ||||
|     docs = merge_sentences(sentences, n_sents) | ||||
|     output_filename = input_path.parts[-1].replace(".iob", ".json") | ||||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|  | @ -23,9 +22,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|            title="Generated output file %s" % path2str(output_file)) | ||||
| 
 | ||||
| 
 | ||||
| def read_iob(file_): | ||||
| def read_iob(raw_sents): | ||||
|     sentences = [] | ||||
|     for line in file_: | ||||
|     for line in raw_sents: | ||||
|         if not line.strip(): | ||||
|             continue | ||||
|         tokens = [t.split('|') for t in line.split()] | ||||
|  | @ -43,3 +42,15 @@ def read_iob(file_): | |||
|     paragraphs = [{'sentences': [sent]} for sent in sentences] | ||||
|     docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] | ||||
|     return docs | ||||
| 
 | ||||
| def merge_sentences(docs, n_sents): | ||||
|     counter = 0 | ||||
|     merged = [] | ||||
|     for group in partition_all(n_sents, docs): | ||||
|         group = list(group) | ||||
|         first = group.pop(0) | ||||
|         to_extend = first['paragraphs'][0]['sentences'] | ||||
|         for sent in group[1:]: | ||||
|             to_extend.extend(sent['paragraphs'][0]['sentences']) | ||||
|         merged.append(first) | ||||
|     return merged | ||||
|  |  | |||
							
								
								
									
										119
									
								
								spacy/cli/evaluate.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										119
									
								
								spacy/cli/evaluate.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,119 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals, division, print_function | ||||
| 
 | ||||
| import plac | ||||
| import json | ||||
| from collections import defaultdict | ||||
| import cytoolz | ||||
| from pathlib import Path | ||||
| import dill | ||||
| import tqdm | ||||
| from thinc.neural._classes.model import Model | ||||
| from thinc.neural.optimizers import linear_decay | ||||
| from timeit import default_timer as timer | ||||
| import random | ||||
| import numpy.random | ||||
| 
 | ||||
| from ..tokens.doc import Doc | ||||
| from ..scorer import Scorer | ||||
| from ..gold import GoldParse, merge_sents | ||||
| from ..gold import GoldCorpus, minibatch | ||||
| from ..util import prints | ||||
| from .. import util | ||||
| from .. import about | ||||
| from .. import displacy | ||||
| from ..compat import json_dumps | ||||
| 
 | ||||
| random.seed(0) | ||||
| numpy.random.seed(0) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     model=("Model name or path", "positional", None, str), | ||||
|     data_path=("Location of JSON-formatted evaluation data", "positional", None, str), | ||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||
|     gpu_id=("Use GPU", "option", "g", int), | ||||
|     displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), | ||||
|     displacy_limit=("Limit of parses to render as HTML", "option", "dl", int) | ||||
| ) | ||||
| def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False, | ||||
|              displacy_path=None, displacy_limit=25): | ||||
|     """ | ||||
|     Evaluate a model. To render a sample of parses in a HTML file, set an output | ||||
|     directory as the displacy_path argument. | ||||
|     """ | ||||
|     util.use_gpu(gpu_id) | ||||
|     util.set_env_log(False) | ||||
|     data_path = util.ensure_path(data_path) | ||||
|     displacy_path = util.ensure_path(displacy_path) | ||||
|     if not data_path.exists(): | ||||
|         prints(data_path, title="Evaluation data not found", exits=1) | ||||
|     if displacy_path and not displacy_path.exists(): | ||||
|         prints(displacy_path, title="Visualization output directory not found", exits=1) | ||||
|     corpus = GoldCorpus(data_path, data_path) | ||||
|     nlp = util.load_model(model) | ||||
|     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) | ||||
|     begin = timer() | ||||
|     scorer = nlp.evaluate(dev_docs, verbose=False) | ||||
|     end = timer() | ||||
|     nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) | ||||
|     print_results(scorer, time=end - begin, words=nwords, | ||||
|                   wps=nwords / (end - begin)) | ||||
|     if displacy_path: | ||||
|         docs, golds = zip(*dev_docs) | ||||
|         render_deps = 'parser' in nlp.meta.get('pipeline', []) | ||||
|         render_ents = 'ner' in nlp.meta.get('pipeline', []) | ||||
|         render_parses(docs, displacy_path, model_name=model, limit=displacy_limit, | ||||
|                       deps=render_deps, ents=render_ents) | ||||
|         prints(displacy_path, title="Generated %s parses as HTML" % displacy_limit) | ||||
| 
 | ||||
| 
 | ||||
| def render_parses(docs, output_path, model_name='', limit=250, deps=True, ents=True): | ||||
|     docs[0].user_data['title'] = model_name | ||||
|     if ents: | ||||
|         with (output_path / 'entities.html').open('w') as file_: | ||||
|             html = displacy.render(docs[:limit], style='ent', page=True) | ||||
|             file_.write(html) | ||||
|     if deps: | ||||
|         with (output_path / 'parses.html').open('w') as file_: | ||||
|             html = displacy.render(docs[:limit], style='dep', page=True, options={'compact': True}) | ||||
|             file_.write(html) | ||||
| 
 | ||||
| 
 | ||||
| def print_progress(itn, losses, dev_scores, wps=0.0): | ||||
|     scores = {} | ||||
|     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', | ||||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||
|         scores[col] = 0.0 | ||||
|     scores['dep_loss'] = losses.get('parser', 0.0) | ||||
|     scores['ner_loss'] = losses.get('ner', 0.0) | ||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||
|     scores.update(dev_scores) | ||||
|     scores['wps'] = wps | ||||
|     tpl = '\t'.join(( | ||||
|         '{:d}', | ||||
|         '{dep_loss:.3f}', | ||||
|         '{ner_loss:.3f}', | ||||
|         '{uas:.3f}', | ||||
|         '{ents_p:.3f}', | ||||
|         '{ents_r:.3f}', | ||||
|         '{ents_f:.3f}', | ||||
|         '{tags_acc:.3f}', | ||||
|         '{token_acc:.3f}', | ||||
|         '{wps:.1f}')) | ||||
|     print(tpl.format(itn, **scores)) | ||||
| 
 | ||||
| 
 | ||||
| def print_results(scorer, time, words, wps): | ||||
|     results = { | ||||
|         'Time': '%.2f s' % time, | ||||
|         'Words': words, | ||||
|         'Words/s': '%.0f' % wps, | ||||
|         'TOK': '%.2f' % scorer.token_acc, | ||||
|         'POS': '%.2f' % scorer.tags_acc, | ||||
|         'UAS': '%.2f' % scorer.uas, | ||||
|         'LAS': '%.2f' % scorer.las, | ||||
|         'NER P': '%.2f' % scorer.ents_p, | ||||
|         'NER R': '%.2f' % scorer.ents_r, | ||||
|         'NER F': '%.2f' % scorer.ents_f} | ||||
|     util.print_table(results, title="Results") | ||||
|  | @ -105,8 +105,11 @@ def generate_pipeline(): | |||
|            "parser, ner. For more information, see the docs on processing pipelines.", | ||||
|            title="Enter your model's pipeline components") | ||||
|     pipeline = util.get_raw_input("Pipeline components", True) | ||||
|     replace = {'True': True, 'False': False} | ||||
|     return replace[pipeline] if pipeline in replace else pipeline.split(', ') | ||||
|     subs = {'True': True, 'False': False} | ||||
|     if pipeline in subs: | ||||
|         return subs[pipeline] | ||||
|     else: | ||||
|         return [p.strip() for p in pipeline.split(',')] | ||||
| 
 | ||||
| 
 | ||||
| def validate_meta(meta, keys): | ||||
|  |  | |||
|  | @ -8,8 +8,11 @@ import cytoolz | |||
| from pathlib import Path | ||||
| import dill | ||||
| import tqdm | ||||
| from thinc.neural._classes.model import Model | ||||
| from thinc.neural.optimizers import linear_decay | ||||
| from timeit import default_timer as timer | ||||
| import random | ||||
| import numpy.random | ||||
| 
 | ||||
| from ..tokens.doc import Doc | ||||
| from ..scorer import Scorer | ||||
|  | @ -17,9 +20,13 @@ from ..gold import GoldParse, merge_sents | |||
| from ..gold import GoldCorpus, minibatch | ||||
| from ..util import prints | ||||
| from .. import util | ||||
| from .. import about | ||||
| from .. import displacy | ||||
| from ..compat import json_dumps | ||||
| 
 | ||||
| random.seed(0) | ||||
| numpy.random.seed(0) | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     lang=("model language", "positional", None, str), | ||||
|  | @ -29,15 +36,17 @@ from ..compat import json_dumps | |||
|     n_iter=("number of iterations", "option", "n", int), | ||||
|     n_sents=("number of sentences", "option", "ns", int), | ||||
|     use_gpu=("Use GPU", "option", "g", int), | ||||
|     resume=("Whether to resume training", "flag", "R", bool), | ||||
|     vectors=("Model to load vectors from", "option", "v"), | ||||
|     no_tagger=("Don't train tagger", "flag", "T", bool), | ||||
|     no_parser=("Don't train parser", "flag", "P", bool), | ||||
|     no_entities=("Don't train NER", "flag", "N", bool), | ||||
|     gold_preproc=("Use gold preprocessing", "flag", "G", bool), | ||||
|     version=("Model version", "option", "V", str), | ||||
|     meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) | ||||
| ) | ||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | ||||
|           use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False, | ||||
|           gold_preproc=False): | ||||
| def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, | ||||
|           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, | ||||
|           gold_preproc=False, version="0.0.0", meta_path=None): | ||||
|     """ | ||||
|     Train a model. Expects data in spaCy's JSON format. | ||||
|     """ | ||||
|  | @ -46,19 +55,24 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|     output_path = util.ensure_path(output_dir) | ||||
|     train_path = util.ensure_path(train_data) | ||||
|     dev_path = util.ensure_path(dev_data) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     if not output_path.exists(): | ||||
|         output_path.mkdir() | ||||
|     if not train_path.exists(): | ||||
|         prints(train_path, title="Training data not found", exits=1) | ||||
|     if dev_path and not dev_path.exists(): | ||||
|         prints(dev_path, title="Development data not found", exits=1) | ||||
|     if meta_path is not None and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
|     meta = util.read_json(meta_path) if meta_path else {} | ||||
|     if not isinstance(meta, dict): | ||||
|         prints("Expected dict but got: {}".format(type(meta)), | ||||
|                title="Not a valid meta.json format", exits=1) | ||||
| 
 | ||||
|     lang_class = util.get_lang_class(lang) | ||||
| 
 | ||||
|     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] | ||||
|     if no_tagger and 'tags' in pipeline: pipeline.remove('tags') | ||||
|     if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') | ||||
|     if no_entities and 'entities' in pipeline: pipeline.remove('entities') | ||||
|     pipeline = ['tagger', 'parser', 'ner'] | ||||
|     if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') | ||||
|     if no_parser and 'parser' in pipeline: pipeline.remove('parser') | ||||
|     if no_entities and 'ner' in pipeline: pipeline.remove('ner') | ||||
| 
 | ||||
|     # Take dropout and batch size as generators of values -- dropout | ||||
|     # starts high and decays sharply, to force the optimizer to explore. | ||||
|  | @ -68,33 +82,30 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|                                   util.env_opt('dropout_to', 0.2), | ||||
|                                   util.env_opt('dropout_decay', 0.0)) | ||||
|     batch_sizes = util.compounding(util.env_opt('batch_from', 1), | ||||
|                                    util.env_opt('batch_to', 64), | ||||
|                                    util.env_opt('batch_to', 16), | ||||
|                                    util.env_opt('batch_compound', 1.001)) | ||||
| 
 | ||||
|     if resume: | ||||
|         prints(output_path / 'model9.pickle', title="Resuming training") | ||||
|         nlp = dill.load((output_path / 'model9.pickle').open('rb')) | ||||
|     else: | ||||
|         nlp = lang_class(pipeline=pipeline) | ||||
|     corpus = GoldCorpus(train_path, dev_path, limit=n_sents) | ||||
|     n_train_words = corpus.count_train() | ||||
| 
 | ||||
|     lang_class = util.get_lang_class(lang) | ||||
|     nlp = lang_class(pipeline=pipeline) | ||||
|     if vectors: | ||||
|         util.load_model(vectors, vocab=nlp.vocab) | ||||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) | ||||
|     nlp._optimizer = None | ||||
| 
 | ||||
|     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") | ||||
|     print("Itn.\tP.Loss\tN.Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") | ||||
|     try: | ||||
|         for i in range(n_iter): | ||||
|             if resume: | ||||
|                 i += 20 | ||||
|             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: | ||||
|         train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0, | ||||
|                                        gold_preproc=gold_preproc, max_length=0) | ||||
|         train_docs = list(train_docs) | ||||
|         for i in range(n_iter): | ||||
|             with tqdm.tqdm(total=n_train_words, leave=False) as pbar: | ||||
|                 losses = {} | ||||
|                 for batch in minibatch(train_docs, size=batch_sizes): | ||||
|                     docs, golds = zip(*batch) | ||||
|                     nlp.update(docs, golds, sgd=optimizer, | ||||
|                                drop=next(dropout_rates), losses=losses, | ||||
|                                update_shared=True) | ||||
|                                drop=next(dropout_rates), losses=losses) | ||||
|                     pbar.update(sum(len(doc) for doc in docs)) | ||||
| 
 | ||||
|             with nlp.use_params(optimizer.averages): | ||||
|  | @ -104,12 +115,22 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|                 nlp_loaded = lang_class(pipeline=pipeline) | ||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) | ||||
|                 scorer = nlp_loaded.evaluate( | ||||
|                             corpus.dev_docs( | ||||
|                             list(corpus.dev_docs( | ||||
|                                 nlp_loaded, | ||||
|                                 gold_preproc=gold_preproc)) | ||||
|                                 gold_preproc=gold_preproc))) | ||||
|                 acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') | ||||
|                 with acc_loc.open('w') as file_: | ||||
|                     file_.write(json_dumps(scorer.scores)) | ||||
|                 meta_loc = output_path / ('model%d' % i) / 'meta.json' | ||||
|                 meta['accuracy'] = scorer.scores | ||||
|                 meta['lang'] = nlp.lang | ||||
|                 meta['pipeline'] = pipeline | ||||
|                 meta['spacy_version'] = '>=%s' % about.__version__ | ||||
|                 meta.setdefault('name', 'model%d' % i) | ||||
|                 meta.setdefault('version', version) | ||||
| 
 | ||||
|                 with meta_loc.open('w') as file_: | ||||
|                     file_.write(json_dumps(meta)) | ||||
|                 util.set_env_log(True) | ||||
|             print_progress(i, losses, scorer.scores) | ||||
|     finally: | ||||
|  | @ -138,12 +159,14 @@ def print_progress(itn, losses, dev_scores, wps=0.0): | |||
|                 'ents_p', 'ents_r', 'ents_f', 'wps']: | ||||
|         scores[col] = 0.0 | ||||
|     scores['dep_loss'] = losses.get('parser', 0.0) | ||||
|     scores['ner_loss'] = losses.get('ner', 0.0) | ||||
|     scores['tag_loss'] = losses.get('tagger', 0.0) | ||||
|     scores.update(dev_scores) | ||||
|     scores['wps'] = wps | ||||
|     tpl = '\t'.join(( | ||||
|         '{:d}', | ||||
|         '{dep_loss:.3f}', | ||||
|         '{ner_loss:.3f}', | ||||
|         '{uas:.3f}', | ||||
|         '{ents_p:.3f}', | ||||
|         '{ents_r:.3f}', | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ import re | |||
| import ujson | ||||
| import random | ||||
| import cytoolz | ||||
| import itertools | ||||
| 
 | ||||
| from .syntax import nonproj | ||||
| from .util import ensure_path | ||||
|  | @ -146,9 +147,13 @@ def minibatch(items, size=8): | |||
|     '''Iterate over batches of items. `size` may be an iterator, | ||||
|     so that batch-size can vary on each step. | ||||
|     ''' | ||||
|     if isinstance(size, int): | ||||
|         size_ = itertools.repeat(8) | ||||
|     else: | ||||
|         size_ = size | ||||
|     items = iter(items) | ||||
|     while True: | ||||
|         batch_size = next(size) #if hasattr(size, '__next__') else size | ||||
|         batch_size = next(size_) | ||||
|         batch = list(cytoolz.take(int(batch_size), items)) | ||||
|         if len(batch) == 0: | ||||
|             break | ||||
|  |  | |||
|  | @ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm | |||
|           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' | ||||
|           'TB T G M K %') | ||||
| _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' | ||||
| _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' | ||||
| _punct = r'… …… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ ·' | ||||
| _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' | ||||
| _hyphens = '- – — -- ---' | ||||
| _hyphens = '- – — -- --- —— ~' | ||||
| _other_symbols = r'[\p{So}]' | ||||
| 
 | ||||
| UNITS = merge_chars(_units) | ||||
|  |  | |||
|  | @ -3,6 +3,7 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .norm_exceptions import NORM_EXCEPTIONS | ||||
| from .punctuation import TOKENIZER_INFIXES | ||||
| from .tag_map import TAG_MAP | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lemmatizer import LOOKUP | ||||
|  | @ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults): | |||
|                                          NORM_EXCEPTIONS, BASE_NORMS) | ||||
| 
 | ||||
|     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) | ||||
|     infixes = tuple(TOKENIZER_INFIXES) | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
|     syntax_iterators = dict(SYNTAX_ITERATORS) | ||||
|  |  | |||
							
								
								
									
										20
									
								
								spacy/lang/de/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								spacy/lang/de/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | ||||
| from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||
| 
 | ||||
| 
 | ||||
| _quotes = QUOTES.replace("'", '') | ||||
| 
 | ||||
| _infixes = (LIST_ELLIPSES + LIST_ICONS + | ||||
|             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), | ||||
|              r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA), | ||||
|              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), | ||||
|              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), | ||||
|              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes), | ||||
|              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), | ||||
|              r'(?<=[0-9])-(?=[0-9])']) | ||||
| 
 | ||||
| 
 | ||||
| TOKENIZER_INFIXES = _infixes | ||||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | |||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH | ||||
| from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .lemmatizer import LOOKUP | ||||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| 
 | ||||
|  | @ -17,6 +18,7 @@ from ...util import update_exc, add_lookups | |||
| 
 | ||||
| class FrenchDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
|     lex_attr_getters[LANG] = lambda text: 'fr' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										41
									
								
								spacy/lang/fr/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								spacy/lang/fr/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,41 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| _num_words = set(""" | ||||
| zero un deux trois quatre cinq six sept huit neuf dix | ||||
| onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf | ||||
| vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante | ||||
| cent mille mil million milliard billion quadrillion quintillion | ||||
| sextillion septillion octillion nonillion decillion | ||||
| """.split()) | ||||
| 
 | ||||
| _ordinal_words = set(""" | ||||
| premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième | ||||
| onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième | ||||
| vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième | ||||
| centième millième millionnième milliardième billionnième quadrillionnième quintillionnième | ||||
| sextillionnième septillionnième octillionnième nonillionnième decillionnième | ||||
| """.split()) | ||||
| 
 | ||||
| 
 | ||||
| def like_num(text): | ||||
|     # Might require more work? | ||||
|     # See this discussion: https://github.com/explosion/spaCy/pull/1161 | ||||
|     text = text.replace(',', '').replace('.', '') | ||||
|     if text.isdigit(): | ||||
|         return True | ||||
|     if text.count('/') == 1: | ||||
|         num, denom = text.split('/') | ||||
|         if num.isdigit() and denom.isdigit(): | ||||
|             return True | ||||
|     if text in _num_words: | ||||
|         return True | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| LEX_ATTRS = { | ||||
|     LIKE_NUM: like_num | ||||
| } | ||||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .stop_words import STOP_WORDS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
|  | @ -12,6 +13,7 @@ from ...util import update_exc, add_lookups | |||
| 
 | ||||
| class DutchDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters.update(LEX_ATTRS) | ||||
|     lex_attr_getters[LANG] = lambda text: 'nl' | ||||
|     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										40
									
								
								spacy/lang/nl/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								spacy/lang/nl/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,40 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
| 
 | ||||
| _num_words = set(""" | ||||
| nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien | ||||
| veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd | ||||
| duizend miljoen miljard biljoen biljard triljoen triljard | ||||
| """.split()) | ||||
| 
 | ||||
| _ordinal_words = set(""" | ||||
| eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde | ||||
| twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste | ||||
| zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste | ||||
| miljardste biljoenste biljardste triljoenste triljardste | ||||
| """.split()) | ||||
| 
 | ||||
| 
 | ||||
| def like_num(text): | ||||
|     # This only does the most basic check for whether a token is a digit | ||||
|     # or matches one of the number words. In order to handle numbers like | ||||
|     # "drieëntwintig", more work is required. | ||||
|     # See this discussion: https://github.com/explosion/spaCy/pull/1177 | ||||
|     text = text.replace(',', '').replace('.', '') | ||||
|     if text.isdigit(): | ||||
|         return True | ||||
|     if text.count('/') == 1: | ||||
|         num, denom = text.split('/') | ||||
|         if num.isdigit() and denom.isdigit(): | ||||
|             return True | ||||
|     if text in _num_words: | ||||
|         return True | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| LEX_ATTRS = { | ||||
|     LIKE_NUM: like_num | ||||
| } | ||||
							
								
								
									
										35
									
								
								spacy/lang/th/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								spacy/lang/th/__init__.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,35 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from .tag_map import TAG_MAP | ||||
| from .stop_words import STOP_WORDS | ||||
| 
 | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...tokens import Doc | ||||
| from ..norm_exceptions import BASE_NORMS | ||||
| from ...language import Language | ||||
| from ...attrs import LANG, NORM | ||||
| from ...util import update_exc, add_lookups | ||||
| 
 | ||||
| class ThaiDefaults(Language.Defaults): | ||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||
|     lex_attr_getters[LANG] = lambda text: 'th' | ||||
|     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||
|     tag_map = dict(TAG_MAP) | ||||
|     stop_words = set(STOP_WORDS) | ||||
| 
 | ||||
| 
 | ||||
| class Thai(Language): | ||||
| 	lang = 'th' | ||||
| 	Defaults = ThaiDefaults | ||||
| 	def make_doc(self, text): | ||||
| 		try: | ||||
| 			from pythainlp.tokenize import word_tokenize | ||||
| 		except ImportError: | ||||
| 			raise ImportError("The Thai tokenizer requires the PyThaiNLP library: " | ||||
| 								"https://github.com/wannaphongcom/pythainlp/") | ||||
| 		words = [x for x in list(word_tokenize(text,"newmm"))] | ||||
| 		return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||
| 
 | ||||
| __all__ = ['Thai'] | ||||
							
								
								
									
										62
									
								
								spacy/lang/th/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								spacy/lang/th/stop_words.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,62 @@ | |||
| # encoding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| # data from https://github.com/wannaphongcom/pythainlp/blob/dev/pythainlp/corpus/stopwords-th.txt | ||||
| # stop words as whitespace-separated list | ||||
| STOP_WORDS = set(""" | ||||
| นี้	นํา	นั้น	นัก	นอกจาก	ทุก	ที่สุด	ที่	ทําให้	ทํา	ทาง	ทั้งนี้	ดัง	ซึ่ง	ช่วง	จาก	จัด	จะ	คือ	ความ	ครั้ง	คง	ขึ้น	ของ | ||||
| ขอ	รับ	ระหว่าง	รวม	ยัง	มี	มาก	มา	พร้อม	พบ	ผ่าน	ผล	บาง	น่า	เปิดเผย	เปิด	เนื่องจาก	เดียวกัน	เดียว	เช่น	เฉพาะ	เข้า	ถ้า | ||||
| ถูก	ถึง	ต้อง	ต่างๆ	ต่าง	ต่อ	ตาม	ตั้งแต่	ตั้ง	ด้าน	ด้วย	อีก	อาจ	ออก	อย่าง	อะไร	อยู่	อยาก	หาก	หลาย	หลังจาก	แต่	เอง	เห็น | ||||
| เลย	เริ่ม	เรา	เมื่อ	เพื่อ	เพราะ	เป็นการ	เป็น	หลัง	หรือ	หนึ่ง	ส่วน	ส่ง	สุด	สําหรับ	ว่า	ลง	ร่วม	ราย	ขณะ	ก่อน	ก็	การ	กับ	กัน | ||||
| กว่า	กล่าว	จึง	ไว้	ไป	ได้	ให้	ใน	โดย	แห่ง	แล้ว	และ	แรก	แบบ	ๆ	ทั้ง	วัน	เขา	เคย	ไม่	อยาก	เกิน	เกินๆ	เกี่ยวกัน	เกี่ยวกับ | ||||
| เกี่ยวข้อง	เกี่ยวเนื่อง	เกี่ยวๆ	เกือบ	เกือบจะ	เกือบๆ	แก	แก่	แก้ไข	ใกล้	ใกล้ๆ	ไกล	ไกลๆ	ขณะเดียวกัน	ขณะใด	ขณะใดๆ	ขณะที่	ขณะนั้น	ขณะนี้	ขณะหนึ่ง	ขวาง | ||||
| ขวางๆ	ขั้น	ใคร	ใคร่	ใคร่จะ	ใครๆ	ง่าย	ง่ายๆ	ไง	จง	จด	จน	จนกระทั่ง	จนกว่า	จนขณะนี้	จนตลอด	จนถึง	จนทั่ว	จนบัดนี้	จนเมื่อ	จนแม้	จนแม้น | ||||
| จรด	จรดกับ	จริง	จริงจัง	จริงๆ	จริงๆจังๆ	จวน	จวนจะ	จวนเจียน	จวบ	ซึ่งก็	ซึ่งก็คือ	ซึ่งกัน	ซึ่งกันและกัน	ซึ่งได้แก่	ซึ่งๆ	ณ	ด้วย	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยประการฉะนี้ | ||||
| ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น	ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดั่ง	ดังกล่าว	ดังกับ	ดั่งกับ	ดังกับว่า	ดั่งกับว่า	ดังเก่า | ||||
| ดั่งเก่า	ดังเคย	ใดๆ	ได้	ได้แก่	ได้แต่	ได้ที่	ได้มา	ได้รับ	ตน	ตนเอง	ตนฯ	ตรง	ตรงๆ	ตลอด	ตลอดกาล	ตลอดกาลนาน	ตลอดจน	ตลอดถึง	ตลอดทั้ง | ||||
| ตลอดทั่ว	ตลอดทั่วถึง	ตลอดทั่วทั้ง	ตลอดปี	ตลอดไป	ตลอดมา	ตลอดระยะเวลา	ตลอดวัน	ตลอดเวลา	ตลอดศก	ต่อ	ต่อกัน	ถึงแก่	ถึงจะ	ถึงบัดนั้น	ถึงบัดนี้ | ||||
| ถึงเมื่อ	ถึงเมื่อใด	ถึงเมื่อไร	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือ	ถือว่า	ถูกต้อง	ถูกๆ	เถอะ	เถิด	ทรง	ทว่า	ทั้งคน	ทั้งตัว	ทั้งที	ทั้งที่	ทั้งนั้น	ทั้งนั้นด้วย	ทั้งนั้นเพราะ | ||||
| นอก	นอกจากที่	นอกจากนั้น	นอกจากนี้	นอกจากว่า	นอกนั้น	นอกเหนือ	นอกเหนือจาก	น้อย	น้อยกว่า	น้อยๆ	นะ	น่ะ	นักๆ	นั่น	นั่นไง	นั่นเป็น	นั่นแหละ | ||||
| นั่นเอง	นั้นๆ	นับ	นับจากนั้น	นับจากนี้	นับตั้งแต่	นับแต่	นับแต่ที่	นับแต่นั้น	เป็นต้น	เป็นต้นไป	เป็นต้นมา	เป็นแต่	เป็นแต่เพียง	เป็นที	เป็นที่	เป็นที่สุด	เป็นเพราะ | ||||
| เป็นเพราะว่า	เป็นเพียง	เป็นเพียงว่า	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอันว่า	เป็นอันๆ	เป็นอาทิ	เป็นๆ	เปลี่ยน	เปลี่ยนแปลง	เปิด	เปิดเผย	ไป่	ผ่าน	ผ่านๆ | ||||
| ผิด	ผิดๆ	ผู้	เพียงเพื่อ	เพียงไร	เพียงไหน	เพื่อที่	เพื่อที่จะ	เพื่อว่า	เพื่อให้	ภาค	ภาคฯ	ภาย	ภายใต้	ภายนอก	ภายใน	ภายภาค	ภายภาคหน้า	ภายหน้า	ภายหลัง | ||||
| มอง	มองว่า	มัก	มักจะ	มัน	มันๆ	มั้ย	มั้ยนะ	มั้ยนั่น	มั้ยเนี่ย	มั้ยล่ะ	ยืนนาน	ยืนยง	ยืนยัน	ยืนยาว	เยอะ	เยอะแยะ	เยอะๆ	แยะ	แยะๆ	รวด	รวดเร็ว	ร่วม	รวมกัน	ร่วมกัน | ||||
| รวมด้วย	ร่วมด้วย	รวมถึง	รวมทั้ง	ร่วมมือ	รวมๆ	ระยะ	ระยะๆ	ระหว่าง	รับรอง	รึ	รึว่า	รือ	รือว่า	สิ้นกาลนาน	สืบเนื่อง	สุดๆ	สู่	สูง	สูงกว่า	สูงส่ง	สูงสุด	สูงๆ	เสมือนกับ | ||||
| เสมือนว่า	เสร็จ	เสร็จกัน	เสร็จแล้ว	เสร็จสมบูรณ์	เสร็จสิ้น	เสีย	เสียก่อน	เสียจน	เสียจนกระทั่ง	เสียจนถึง	เสียด้วย	เสียนั่น	เสียนั่นเอง	เสียนี่	เสียนี่กระไร	เสียยิ่ง | ||||
| เสียยิ่งนัก	เสียแล้ว	ใหญ่ๆ	ให้ดี	ให้แด่	ให้ไป	ใหม่	ให้มา	ใหม่ๆ	ไหน	ไหนๆ	อดีต	อนึ่ง	อย่าง	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น | ||||
| อย่างนี้	อย่างโน้น	ก็คือ	ก็แค่	ก็จะ	ก็ดี	ก็ได้	ก็ต่อเมื่อ	ก็ตาม	ก็ตามแต่	ก็ตามที	ก็แล้วแต่	กระทั่ง	กระทำ	กระนั้น	กระผม	กลับ	กล่าวคือ	กลุ่ม	กลุ่มก้อน | ||||
| กลุ่มๆ	กว้าง	กว้างขวาง	กว้างๆ	ก่อนหน้า	ก่อนหน้านี้	ก่อนๆ	กันดีกว่า	กันดีไหม	กันเถอะ	กันนะ	กันและกัน	กันไหม	กันเอง	กำลัง	กำลังจะ	กำหนด	กู	เก็บ | ||||
| เกิด	เกี่ยวข้อง	แก่	แก้ไข	ใกล้	ใกล้ๆ	ข้า	ข้าง	ข้างเคียง	ข้างต้น	ข้างบน	ข้างล่าง	ข้างๆ	ขาด	ข้าพเจ้า	ข้าฯ	เข้าใจ	เขียน	คงจะ	คงอยู่	ครบ	ครบครัน	ครบถ้วน | ||||
| ครั้งกระนั้น	ครั้งก่อน	ครั้งครา	ครั้งคราว	ครั้งใด	ครั้งที่	ครั้งนั้น	ครั้งนี้	ครั้งละ	ครั้งหนึ่ง	ครั้งหลัง	ครั้งหลังสุด	ครั้งไหน	ครั้งๆ	ครัน	ครับ	ครา	คราใด	คราที่	ครานั้น	ครานี้	คราหนึ่ง | ||||
| คราไหน	คราว	คราวก่อน	คราวใด	คราวที่	คราวนั้น	คราวนี้	คราวโน้น	คราวละ	คราวหน้า	คราวหนึ่ง	คราวหลัง	คราวไหน	คราวๆ	คล้าย	คล้ายกัน	คล้ายกันกับ | ||||
| คล้ายกับ	คล้ายกับว่า	คล้ายว่า	ควร	ค่อน	ค่อนข้าง	ค่อนข้างจะ	ค่อยไปทาง	ค่อนมาทาง	ค่อย	ค่อยๆ	คะ	ค่ะ	คำ	คิด	คิดว่า	คุณ	คุณๆ | ||||
| เคยๆ	แค่	แค่จะ	แค่นั้น	แค่นี้	แค่เพียง	แค่ว่า	แค่ไหน	ใคร่	ใคร่จะ	ง่าย	ง่ายๆ	จนกว่า	จนแม้	จนแม้น	จังๆ	จวบกับ	จวบจน	จ้ะ	จ๊ะ	จะได้	จัง	จัดการ	จัดงาน	จัดแจง | ||||
| จัดตั้ง	จัดทำ	จัดหา	จัดให้	จับ	จ้า	จ๋า	จากนั้น	จากนี้ 	จากนี้ไป	จำ	จำเป็น 	จำพวก	จึงจะ	จึงเป็น	จู่ๆ	ฉะนั้น	ฉะนี้	ฉัน	เฉกเช่น	เฉย	เฉยๆ	ไฉน	ช่วงก่อน | ||||
| ช่วงต่อไป	ช่วงถัดไป	ช่วงท้าย	ช่วงที่	ช่วงนั้น	ช่วงนี้	ช่วงระหว่าง	ช่วงแรก	ช่วงหน้า	ช่วงหลัง	ช่วงๆ	ช่วย	ช้า	ช้านาน	ชาว	ช้าๆ	เช่นก่อน	เช่นกัน	เช่นเคย | ||||
| เช่นดัง	เช่นดังก่อน	เช่นดังเก่า	เช่นดังที่	เช่นดังว่า	เช่นเดียวกัน	เช่นเดียวกับ	เช่นใด	เช่นที่	เช่นที่เคย	เช่นที่ว่า	เช่นนั้น	เช่นนั้นเอง	เช่นนี้	เช่นเมื่อ	เช่นไร	เชื่อ | ||||
| เชื่อถือ	เชื่อมั่น	เชื่อว่า	ใช่	ใช่ไหม	ใช้	ซะ	ซะก่อน	ซะจน	ซะจนกระทั่ง	ซะจนถึง	ซึ่งได้แก่	ด้วยกัน	ด้วยเช่นกัน	ด้วยที่	ด้วยเพราะ	ด้วยว่า	ด้วยเหตุที่	ด้วยเหตุนั้น | ||||
| ด้วยเหตุนี้	ด้วยเหตุเพราะ	ด้วยเหตุว่า	ด้วยเหมือนกัน	ดังกล่าว	ดังกับว่า	ดั่งกับว่า	ดังเก่า	ดั่งเก่า	ดั่งเคย	ต่างก็	ต่างหาก	ตามด้วย	ตามแต่	ตามที่ | ||||
| ตามๆ	เต็มไปด้วย	เต็มไปหมด	เต็มๆ	แต่ก็	แต่ก่อน	แต่จะ	แต่เดิม	แต่ต้อง	แต่ถ้า	แต่ทว่า	แต่ที่	แต่นั้น	แต่เพียง	แต่เมื่อ	แต่ไร	แต่ละ	แต่ว่า	แต่ไหน	แต่อย่างใด	โต | ||||
| โตๆ	ใต้	ถ้าจะ	ถ้าหาก	ถึงแก่	ถึงแม้	ถึงแม้จะ	ถึงแม้ว่า	ถึงอย่างไร	ถือว่า	ถูกต้อง	ทว่า	ทั้งนั้นด้วย	ทั้งปวง	ทั้งเป็น	ทั้งมวล	ทั้งสิ้น	ทั้งหมด	ทั้งหลาย	ทั้งๆ	ทัน | ||||
| ทันใดนั้น	ทันที	ทันทีทันใด	ทั่ว	ทำไม	ทำไร	ทำให้	ทำๆ	ที	ที่จริง	ที่ซึ่ง	ทีเดียว	ทีใด	ที่ใด	ที่ได้	ทีเถอะ	ที่แท้	ที่แท้จริง	ที่นั้น	ที่นี้	ทีไร	ทีละ	ที่ละ | ||||
| ที่แล้ว	ที่ว่า	ที่แห่งนั้น	ที่ไหน	ทีๆ	ที่ๆ	ทุกคน	ทุกครั้ง	ทุกครา	ทุกคราว	ทุกชิ้น	ทุกตัว	ทุกทาง	ทุกที	ทุกที่	ทุกเมื่อ	ทุกวัน	ทุกวันนี้	ทุกสิ่ง	ทุกหน	ทุกแห่ง	ทุกอย่าง | ||||
| ทุกอัน	ทุกๆ	เท่า	เท่ากัน	เท่ากับ	เท่าใด	เท่าที่	เท่านั้น	เท่านี้	เท่าไร	เท่าไหร่	แท้	แท้จริง	เธอ	นอกจากว่า	น้อย	น้อยกว่า	น้อยๆ	น่ะ	นั้นไว	นับแต่นี้	นาง | ||||
| นางสาว	น่าจะ	นาน	นานๆ	นาย	นำ	นำพา	นำมา	นิด	นิดหน่อย	นิดๆ	นี่	นี่ไง	นี่นา	นี่แน่ะ	นี่แหละ	นี้แหล่	นี่เอง	นี้เอง	นู่น	นู้น	เน้น	เนี่ย | ||||
| เนี่ยเอง	ในช่วง	ในที่	ในเมื่อ	ในระหว่าง	บน	บอก	บอกแล้ว	บอกว่า	บ่อย	บ่อยกว่า	บ่อยครั้ง	บ่อยๆ	บัดดล	บัดเดี๋ยวนี้	บัดนั้น	บัดนี้	บ้าง	บางกว่า | ||||
| บางขณะ	บางครั้ง	บางครา	บางคราว	บางที	บางที่	บางแห่ง	บางๆ	ปฏิบัติ	ประกอบ	ประการ	ประการฉะนี้	ประการใด	ประการหนึ่ง	ประมาณ	ประสบ	ปรับ | ||||
| ปรากฏ	ปรากฏว่า	ปัจจุบัน	ปิด	เป็นด้วย	เป็นดัง	เป็นต้น	เป็นแต่	เป็นเพื่อ	เป็นอัน	เป็นอันมาก	เป็นอาทิ	ผ่านๆ	ผู้	ผู้ใด	เผื่อ	เผื่อจะ	เผื่อที่	เผื่อว่า	ฝ่าย | ||||
| ฝ่ายใด	พบว่า	พยายาม	พร้อมกัน	พร้อมกับ	พร้อมด้วย	พร้อมทั้ง	พร้อมที่	พร้อมเพียง	พวก	พวกกัน	พวกกู	พวกแก	พวกเขา	พวกคุณ	พวกฉัน	พวกท่าน | ||||
| พวกที่	พวกเธอ	พวกนั้น	พวกนี้	พวกนู้น	พวกโน้น	พวกมัน	พวกมึง	พอ	พอกัน	พอควร	พอจะ	พอดี	พอตัว	พอที	พอที่	พอเพียง	พอแล้ว	พอสม	พอสมควร | ||||
| พอเหมาะ	พอๆ	พา	พึง	พึ่ง	พื้นๆ	พูด	เพราะฉะนั้น	เพราะว่า	เพิ่ง	เพิ่งจะ	เพิ่ม	เพิ่มเติม	เพียง	เพียงแค่	เพียงใด	เพียงแต่	เพียงพอ	เพียงเพราะ | ||||
| เพื่อว่า	เพื่อให้	ภายใต้	มองว่า	มั๊ย	มากกว่า	มากมาย	มิ	มิฉะนั้น	มิใช่	มิได้	มีแต่	มึง	มุ่ง	มุ่งเน้น	มุ่งหมาย	เมื่อก่อน	เมื่อครั้ง	เมื่อครั้งก่อน | ||||
| เมื่อคราวก่อน	เมื่อคราวที่	เมื่อคราว	เมื่อคืน	เมื่อเช้า	เมื่อใด	เมื่อนั้น	เมื่อนี้	เมื่อเย็น	เมื่อไร	เมื่อวันวาน	เมื่อวาน	เมื่อไหร่	แม้	แม้กระทั่ง	แม้แต่	แม้นว่า	แม้ว่า | ||||
| ไม่ค่อย	ไม่ค่อยจะ	ไม่ค่อยเป็น	ไม่ใช่	ไม่เป็นไร	ไม่ว่า	ยก	ยกให้	ยอม	ยอมรับ	ย่อม	ย่อย	ยังคง	ยังงั้น	ยังงี้	ยังโง้น	ยังไง	ยังจะ	ยังแต่	ยาก | ||||
| ยาว	ยาวนาน	ยิ่ง	ยิ่งกว่า	ยิ่งขึ้น	ยิ่งขึ้นไป	ยิ่งจน	ยิ่งจะ	ยิ่งนัก	ยิ่งเมื่อ	ยิ่งแล้ว	ยิ่งใหญ่	ร่วมกัน	รวมด้วย	ร่วมด้วย	รือว่า	เร็ว	เร็วๆ	เราๆ	เรียก	เรียบ	เรื่อย | ||||
| เรื่อยๆ	ไร	ล้วน	ล้วนจน	ล้วนแต่	ละ	ล่าสุด	เล็ก	เล็กน้อย	เล็กๆ	เล่าว่า	แล้วกัน	แล้วแต่	แล้วเสร็จ	วันใด	วันนั้น	วันนี้	วันไหน	สบาย	สมัย	สมัยก่อน | ||||
| สมัยนั้น	สมัยนี้	สมัยโน้น	ส่วนเกิน	ส่วนด้อย	ส่วนดี	ส่วนใด	ส่วนที่	ส่วนน้อย	ส่วนนั้น	ส่วนมาก	ส่วนใหญ่	สั้น	สั้นๆ	สามารถ	สำคัญ	สิ่ง | ||||
| สิ่งใด	สิ่งนั้น	สิ่งนี้	สิ่งไหน	สิ้น	เสร็จแล้ว	เสียด้วย	เสียแล้ว	แสดง	แสดงว่า	หน	หนอ	หนอย	หน่อย	หมด	หมดกัน	หมดสิ้น	หรือไง	หรือเปล่า	หรือไม่	หรือยัง | ||||
| หรือไร	หากแม้	หากแม้น	หากแม้นว่า	หากว่า	หาความ	หาใช่	หารือ	เหตุ	เหตุผล	เหตุนั้น	เหตุนี้	เหตุไร	เห็นแก่	เห็นควร	เห็นจะ	เห็นว่า	เหลือ	เหลือเกิน	เหล่า | ||||
| เหล่านั้น	เหล่านี้	แห่งใด	แห่งนั้น	แห่งนี้	แห่งโน้น	แห่งไหน	แหละ	ให้แก่	ใหญ่	ใหญ่โต	อย่างเช่น	อย่างดี	อย่างเดียว	อย่างใด	อย่างที่	อย่างน้อย	อย่างนั้น	อย่างนี้ | ||||
| อย่างโน้น	อย่างมาก	อย่างยิ่ง	อย่างไร	อย่างไรก็	อย่างไรก็ได้	อย่างไรเสีย	อย่างละ	อย่างหนึ่ง	อย่างไหน	อย่างๆ	อัน	อันจะ	อันใด	อันได้แก่	อันที่ | ||||
| อันที่จริง	อันที่จะ	อันเนื่องมาจาก	อันละ	อันไหน	อันๆ	อาจจะ	อาจเป็น	อาจเป็นด้วย	อื่น	อื่นๆ	เอ็ง	เอา	ฯ	ฯล	ฯลฯ | ||||
| """.split()) | ||||
							
								
								
									
										81
									
								
								spacy/lang/th/tag_map.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										81
									
								
								spacy/lang/th/tag_map.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,81 @@ | |||
| # encoding: utf8 | ||||
| # data from Korakot Chaovavanich (https://www.facebook.com/photo.php?fbid=390564854695031&set=p.390564854695031&type=3&permPage=1&ifg=1) | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import * | ||||
| 
 | ||||
| TAG_MAP = { | ||||
|     #NOUN | ||||
|     "NOUN":     {POS: NOUN}, | ||||
|     "NCMN":     {POS: NOUN}, | ||||
|     "NTTL":     {POS: NOUN}, | ||||
|     "CNIT":     {POS: NOUN}, | ||||
|     "CLTV":     {POS: NOUN}, | ||||
|     "CMTR":     {POS: NOUN}, | ||||
|     "CFQC":     {POS: NOUN}, | ||||
|     "CVBL":     {POS: NOUN}, | ||||
|     #PRON | ||||
|     "PRON":     {POS: PRON}, | ||||
|     "NPRP":     {POS: PRON}, | ||||
|     # ADJ | ||||
|     "ADJ":      {POS: ADJ}, | ||||
|     "NONM":      {POS: ADJ}, | ||||
|     "VATT":      {POS: ADJ}, | ||||
|     "DONM":      {POS: ADJ}, | ||||
|     # ADV | ||||
|     "ADV":      {POS: ADV}, | ||||
|     "ADVN":      {POS: ADV}, | ||||
|     "ADVI":      {POS: ADV}, | ||||
|     "ADVP":      {POS: ADV}, | ||||
|     "ADVS":      {POS: ADV}, | ||||
| 	# INT | ||||
|     "INT":      {POS: INTJ}, | ||||
|     # PRON | ||||
|     "PROPN":    {POS: PROPN}, | ||||
|     "PPRS":    {POS: PROPN}, | ||||
|     "PDMN":    {POS: PROPN}, | ||||
|     "PNTR":    {POS: PROPN}, | ||||
|     # DET | ||||
|     "DET":      {POS: DET}, | ||||
|     "DDAN":      {POS: DET}, | ||||
|     "DDAC":      {POS: DET}, | ||||
|     "DDBQ":      {POS: DET}, | ||||
|     "DDAQ":      {POS: DET}, | ||||
|     "DIAC":      {POS: DET}, | ||||
|     "DIBQ":      {POS: DET}, | ||||
|     "DIAQ":      {POS: DET}, | ||||
|     "DCNM":      {POS: DET}, | ||||
|     # NUM | ||||
|     "NUM":      {POS: NUM}, | ||||
|     "NCNM":      {POS: NUM}, | ||||
|     "NLBL":      {POS: NUM}, | ||||
|     "DCNM":      {POS: NUM}, | ||||
| 	# AUX | ||||
|     "AUX":      {POS: AUX}, | ||||
|     "XVBM":      {POS: AUX}, | ||||
|     "XVAM":      {POS: AUX}, | ||||
|     "XVMM":      {POS: AUX}, | ||||
|     "XVBB":      {POS: AUX}, | ||||
|     "XVAE":      {POS: AUX}, | ||||
| 	# ADP | ||||
|     "ADP":      {POS: ADP}, | ||||
|     "RPRE":      {POS: ADP}, | ||||
|     # CCONJ | ||||
|     "CCONJ":    {POS: CCONJ}, | ||||
|     "JCRG":    {POS: CCONJ}, | ||||
| 	# SCONJ | ||||
|     "SCONJ":    {POS: SCONJ}, | ||||
|     "PREL":    {POS: SCONJ}, | ||||
|     "JSBR":    {POS: SCONJ}, | ||||
|     "JCMP":    {POS: SCONJ}, | ||||
|     # PART | ||||
|     "PART":    {POS: PART}, | ||||
|     "FIXN":    {POS: PART}, | ||||
|     "FIXV":    {POS: PART}, | ||||
|     "EAFF":    {POS: PART}, | ||||
|     "AITT":    {POS: PART}, | ||||
|     "NEG":    {POS: PART}, | ||||
|     # PUNCT | ||||
|     "PUNCT":    {POS: PUNCT}, | ||||
|     "PUNC":    {POS: PUNCT} | ||||
| } | ||||
							
								
								
									
										43
									
								
								spacy/lang/th/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								spacy/lang/th/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,43 @@ | |||
| # encoding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...symbols import * | ||||
| 
 | ||||
| TOKENIZER_EXCEPTIONS = { | ||||
|     "ม.ค.": [ | ||||
|         {ORTH: "ม.ค.", LEMMA: "มกราคม"} | ||||
|     ], | ||||
|     "ก.พ.": [ | ||||
|         {ORTH: "ก.พ.", LEMMA: "กุมภาพันธ์"} | ||||
|     ], | ||||
|     "มี.ค.": [ | ||||
|         {ORTH: "มี.ค.", LEMMA: "มีนาคม"} | ||||
|     ], | ||||
|     "เม.ย.": [ | ||||
|         {ORTH: "เม.ย.", LEMMA: "เมษายน"} | ||||
|     ], | ||||
|     "พ.ค.": [ | ||||
|         {ORTH: "พ.ค.", LEMMA: "พฤษภาคม"} | ||||
|     ], | ||||
|     "มิ.ย.": [ | ||||
|         {ORTH: "มิ.ย.", LEMMA: "มิถุนายน"} | ||||
|     ], | ||||
|     "ก.ค.": [ | ||||
|         {ORTH: "ก.ค.", LEMMA: "กรกฎาคม"} | ||||
|     ], | ||||
|     "ส.ค.": [ | ||||
|         {ORTH: "ส.ค.", LEMMA: "สิงหาคม"} | ||||
|     ], | ||||
|     "ก.ย.": [ | ||||
|         {ORTH: "ก.ย.", LEMMA: "กันยายน"} | ||||
|     ], | ||||
|     "ต.ค.": [ | ||||
|         {ORTH: "ต.ค.", LEMMA: "ตุลาคม"} | ||||
|     ], | ||||
|     "พ.ย.": [ | ||||
|         {ORTH: "พ.ย.", LEMMA: "พฤศจิกายน"} | ||||
|     ], | ||||
|     "ธ.ค.": [ | ||||
|         {ORTH: "ธ.ค.", LEMMA: "ธันวาคม"} | ||||
|     ] | ||||
| } | ||||
|  | @ -14,8 +14,8 @@ class Chinese(Language): | |||
|         except ImportError: | ||||
|             raise ImportError("The Chinese tokenizer requires the Jieba library: " | ||||
|                               "https://github.com/fxsjy/jieba") | ||||
|         words = list(jieba.cut(text, cut_all=True)) | ||||
|         words=[x for x in words if x] | ||||
|         words = list(jieba.cut(text, cut_all=False)) | ||||
|         words = [x for x in words if x] | ||||
|         return Doc(self.vocab, words=words, spaces=[False]*len(words)) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP | |||
| from .lang.lex_attrs import LEX_ATTRS | ||||
| from . import util | ||||
| from .scorer import Scorer | ||||
| from ._ml import link_vectors_to_models | ||||
| 
 | ||||
| 
 | ||||
| class BaseDefaults(object): | ||||
|  | @ -278,8 +279,7 @@ class Language(object): | |||
|     def make_doc(self, text): | ||||
|         return self.tokenizer(text) | ||||
| 
 | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None, | ||||
|             update_shared=False): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         """Update the models in the pipeline. | ||||
| 
 | ||||
|         docs (iterable): A batch of `Doc` objects. | ||||
|  | @ -303,32 +303,17 @@ class Language(object): | |||
|             if self._optimizer is None: | ||||
|                 self._optimizer = Adam(Model.ops, 0.001) | ||||
|             sgd = self._optimizer | ||||
|         tok2vec = self.pipeline[0] | ||||
|         feats = tok2vec.doc2feats(docs) | ||||
|         grads = {} | ||||
|         def get_grads(W, dW, key=None): | ||||
|             grads[key] = (W, dW) | ||||
|         pipes = list(self.pipeline[1:]) | ||||
|         pipes = list(self.pipeline) | ||||
|         random.shuffle(pipes) | ||||
|         tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) | ||||
|         all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] | ||||
|         for proc in pipes: | ||||
|             if not hasattr(proc, 'update'): | ||||
|                 continue | ||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, | ||||
|                                       drop=drop, sgd=get_grads, losses=losses) | ||||
|             if update_shared and d_tokvecses is not None: | ||||
|                 for i, d_tv in enumerate(d_tokvecses): | ||||
|                     all_d_tokvecses[i] += d_tv | ||||
|         if update_shared and bp_tokvecses is not None: | ||||
|             bp_tokvecses(all_d_tokvecses, sgd=sgd) | ||||
|             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||
|         for key, (W, dW) in grads.items(): | ||||
|             sgd(W, dW, key=key) | ||||
|         # Clear the tensor variable, to free GPU memory. | ||||
|         # If we don't do this, the memory leak gets pretty | ||||
|         # bad, because we may be holding part of a batch. | ||||
|         for doc in docs: | ||||
|             doc.tensor = None | ||||
| 
 | ||||
|     def preprocess_gold(self, docs_golds): | ||||
|         """Can be called before training to pre-process gold data. By default, | ||||
|  | @ -343,36 +328,49 @@ class Language(object): | |||
|         for doc, gold in docs_golds: | ||||
|             yield doc, gold | ||||
| 
 | ||||
|     def begin_training(self, get_gold_tuples, **cfg): | ||||
|     def resume_training(self, **cfg): | ||||
|         if cfg.get('device', -1) >= 0: | ||||
|             device = util.use_gpu(cfg['device']) | ||||
|             if self.vocab.vectors.data.shape[1] >= 1: | ||||
|                 self.vocab.vectors.data = Model.ops.asarray( | ||||
|                     self.vocab.vectors.data) | ||||
|         else: | ||||
|             device = None | ||||
|         learn_rate = util.env_opt('learn_rate', 0.001) | ||||
|         beta1 = util.env_opt('optimizer_B1', 0.9) | ||||
|         beta2 = util.env_opt('optimizer_B2', 0.999) | ||||
|         eps = util.env_opt('optimizer_eps', 1e-08) | ||||
|         L2 = util.env_opt('L2_penalty', 1e-6) | ||||
|         max_grad_norm = util.env_opt('grad_norm_clip', 1.) | ||||
|         self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, | ||||
|                               beta2=beta2, eps=eps) | ||||
|         self._optimizer.max_grad_norm = max_grad_norm | ||||
|         self._optimizer.device = device | ||||
|         return self._optimizer | ||||
| 
 | ||||
|     def begin_training(self, get_gold_tuples=None, **cfg): | ||||
|         """Allocate models, pre-process training data and acquire a trainer and | ||||
|         optimizer. Used as a contextmanager. | ||||
| 
 | ||||
|         gold_tuples (iterable): Gold-standard training data. | ||||
|         get_gold_tuples (function): Function returning gold data | ||||
|         **cfg: Config parameters. | ||||
|         YIELDS (tuple): A trainer and an optimizer. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): | ||||
|             >>>    for epoch in trainer.epochs(gold): | ||||
|             >>>        for docs, golds in epoch: | ||||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) | ||||
|         returns: An optimizer | ||||
|         """ | ||||
|         if self.parser: | ||||
|             self.pipeline.append(NeuralLabeller(self.vocab)) | ||||
|         # Populate vocab | ||||
|         if get_gold_tuples is not None: | ||||
|             for _, annots_brackets in get_gold_tuples(): | ||||
|                 for annots, _ in annots_brackets: | ||||
|                     for word in annots[1]: | ||||
|                         _ = self.vocab[word] | ||||
|         contexts = [] | ||||
|         if cfg.get('device', -1) >= 0: | ||||
|             import cupy.cuda.device | ||||
|             device = cupy.cuda.device.Device(cfg['device']) | ||||
|             device.use() | ||||
|             Model.ops = CupyOps() | ||||
|             Model.Ops = CupyOps | ||||
|             device = util.use_gpu(cfg['device']) | ||||
|             if self.vocab.vectors.data.shape[1] >= 1: | ||||
|                 self.vocab.vectors.data = Model.ops.asarray( | ||||
|                     self.vocab.vectors.data) | ||||
|         else: | ||||
|             device = None | ||||
|         link_vectors_to_models(self.vocab) | ||||
|         for proc in self.pipeline: | ||||
|             if hasattr(proc, 'begin_training'): | ||||
|                 context = proc.begin_training(get_gold_tuples(), | ||||
|  | @ -390,7 +388,7 @@ class Language(object): | |||
|         self._optimizer.device = device | ||||
|         return self._optimizer | ||||
| 
 | ||||
|     def evaluate(self, docs_golds): | ||||
|     def evaluate(self, docs_golds, verbose=False): | ||||
|         scorer = Scorer() | ||||
|         docs, golds = zip(*docs_golds) | ||||
|         docs = list(docs) | ||||
|  | @ -403,8 +401,9 @@ class Language(object): | |||
|                 docs = list(pipe.pipe(docs)) | ||||
|         assert len(docs) == len(golds) | ||||
|         for doc, gold in zip(docs, golds): | ||||
|             scorer.score(doc, gold) | ||||
|             doc.tensor = None | ||||
|             if verbose: | ||||
|                 print(doc) | ||||
|             scorer.score(doc, gold, verbose=verbose) | ||||
|         return scorer | ||||
| 
 | ||||
|     @contextmanager | ||||
|  | @ -493,7 +492,6 @@ class Language(object): | |||
|         """ | ||||
|         path = util.ensure_path(path) | ||||
|         serializers = OrderedDict(( | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||
|         )) | ||||
|  | @ -505,6 +503,7 @@ class Language(object): | |||
|             if not hasattr(proc, 'to_disk'): | ||||
|                 continue | ||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||
|         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||
|         util.to_disk(path, serializers, {p: False for p in disable}) | ||||
| 
 | ||||
|     def from_disk(self, path, disable=tuple()): | ||||
|  |  | |||
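
The to_disk change above registers the pipeline components' serializers first and moves the vocab serializer to the end of the dict. As a rough mental model only (a sketch of the serializer-dict pattern, not spaCy's actual util.to_disk), the dict is consumed roughly like this:

    from pathlib import Path

    def to_disk_sketch(path, serializers, exclude):
        # Call each writer on its own sub-path, skipping excluded names.
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        for name, writer in serializers.items():
            if name in exclude:
                continue
            writer(path / name)
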
|  | @ -38,7 +38,8 @@ class Lemmatizer(object): | |||
|         avoid lemmatization entirely. | ||||
|         """ | ||||
|         morphology = {} if morphology is None else morphology | ||||
|         others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] | ||||
|         others = [key for key in morphology | ||||
|                   if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')] | ||||
|         true_morph_key = morphology.get('morph', 0) | ||||
|         if univ_pos == 'noun' and morphology.get('Number') == 'sing': | ||||
|             return True | ||||
|  | @ -47,7 +48,9 @@ class Lemmatizer(object): | |||
|         # This maps 'VBP' to base form -- probably just need 'IS_BASE' | ||||
|         # morphology | ||||
|         elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ | ||||
|                                      morphology.get('Tense') == 'pres'): | ||||
|                                      morphology.get('Tense') == 'pres' and \ | ||||
|                                      morphology.get('Number') is None and \ | ||||
|                                      not others): | ||||
|             return True | ||||
|         elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': | ||||
|             return True | ||||
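
The stricter check above only treats a present-tense finite verb as a base form when no other morphological features are set. A standalone, hedged sketch of that test (simplified: the real method also checks the POS key and handles nouns and adjectives):

    def looks_like_base_verb(morphology):
        ignored = ('Number', 'POS', 'VerbForm', 'Tense')
        others = [key for key in morphology if key not in ignored]
        return (morphology.get('VerbForm') == 'fin'
                and morphology.get('Tense') == 'pres'
                and morphology.get('Number') is None
                and not others)

    print(looks_like_base_verb({'VerbForm': 'fin', 'Tense': 'pres'}))               # True
    print(looks_like_base_verb({'VerbForm': 'fin', 'Tense': 'pres', 'Person': 3}))  # False
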
|  |  | |||
|  | @ -421,47 +421,69 @@ cdef class PhraseMatcher: | |||
|     cdef int max_length | ||||
|     cdef attr_t* _phrase_key | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, phrases, max_length=10): | ||||
|     cdef public object _callbacks | ||||
|     cdef public object _patterns | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, max_length=10): | ||||
|         self.mem = Pool() | ||||
|         self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t)) | ||||
|         self.max_length = max_length | ||||
|         self.vocab = vocab | ||||
|         self.matcher = Matcher(self.vocab, {}) | ||||
|         self.matcher = Matcher(self.vocab) | ||||
|         self.phrase_ids = PreshMap() | ||||
|         for phrase in phrases: | ||||
|             if len(phrase) < max_length: | ||||
|                 self.add(phrase) | ||||
| 
 | ||||
|         abstract_patterns = [] | ||||
|         for length in range(1, max_length): | ||||
|             abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) | ||||
|         self.matcher.add('Candidate', 'MWE', {}, abstract_patterns, acceptor=self.accept_match) | ||||
|         self.matcher.add('Candidate', None, *abstract_patterns) | ||||
|         self._callbacks = {} | ||||
| 
 | ||||
|     def add(self, Doc tokens): | ||||
|         cdef int length = tokens.length | ||||
|         assert length < self.max_length | ||||
|         tags = get_bilou(length) | ||||
|         assert len(tags) == length, length | ||||
|     def __len__(self): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def __contains__(self, key): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (self.__class__, (self.vocab,), None, None) | ||||
| 
 | ||||
|     def add(self, key, on_match, *docs): | ||||
|         cdef Doc doc | ||||
|         for doc in docs: | ||||
|             if len(doc) >= self.max_length: | ||||
|                 msg = ( | ||||
|                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " | ||||
|                     "Length can be set on initialization, up to 10." | ||||
|                 ) | ||||
|                 raise ValueError(msg % (len(doc), self.max_length)) | ||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||
|         self._callbacks[ent_id] = on_match | ||||
| 
 | ||||
|         cdef int length | ||||
|         cdef int i | ||||
|         cdef hash_t phrase_hash | ||||
|         for doc in docs: | ||||
|             length = doc.length | ||||
|             tags = get_bilou(length) | ||||
|             for i in range(self.max_length): | ||||
|                 self._phrase_key[i] = 0 | ||||
|             for i, tag in enumerate(tags): | ||||
|             lexeme = self.vocab[tokens.c[i].lex.orth] | ||||
|                 lexeme = self.vocab[doc.c[i].lex.orth] | ||||
|                 lexeme.set_flag(tag, True) | ||||
|                 self._phrase_key[i] = lexeme.orth | ||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) | ||||
|         self.phrase_ids[key] = True | ||||
|             phrase_hash = hash64(self._phrase_key, | ||||
|                                  self.max_length * sizeof(attr_t), 0) | ||||
|             self.phrase_ids.set(phrase_hash, <void*>ent_id) | ||||
| 
 | ||||
|     def __call__(self, Doc doc): | ||||
|         matches = [] | ||||
|         for ent_id, label, start, end in self.matcher(doc): | ||||
|             cand = doc[start : end] | ||||
|             start = cand[0].idx | ||||
|             end = cand[-1].idx + len(cand[-1]) | ||||
|             matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) | ||||
|         for match in matches: | ||||
|             doc.merge(*match) | ||||
|         for _, start, end in self.matcher(doc): | ||||
|             ent_id = self.accept_match(doc, start, end) | ||||
|             if ent_id is not None: | ||||
|                 matches.append((ent_id, start, end)) | ||||
|         for i, (ent_id, start, end) in enumerate(matches): | ||||
|             on_match = self._callbacks.get(ent_id) | ||||
|             if on_match is not None: | ||||
|                 on_match(self, doc, i, matches) | ||||
|         return matches | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=1000, n_threads=2): | ||||
|  | @ -469,7 +491,7 @@ cdef class PhraseMatcher: | |||
|             self(doc) | ||||
|             yield doc | ||||
| 
 | ||||
|     def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end): | ||||
|     def accept_match(self, Doc doc, int start, int end): | ||||
|         assert (end - start) < self.max_length | ||||
|         cdef int i, j | ||||
|         for i in range(self.max_length): | ||||
|  | @ -477,7 +499,8 @@ cdef class PhraseMatcher: | |||
|         for i, j in enumerate(range(start, end)): | ||||
|             self._phrase_key[i] = doc.c[j].lex.orth | ||||
|         cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) | ||||
|         if self.phrase_ids.get(key): | ||||
|             return (ent_id, label, start, end) | ||||
|         ent_id = <hash_t>self.phrase_ids.get(key) | ||||
|         if ent_id == 0: | ||||
|             return None | ||||
|         else: | ||||
|             return False | ||||
|             return ent_id | ||||
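
The rewritten PhraseMatcher now mirrors the token Matcher API: patterns are Doc objects added under a key with an optional on_match callback, and calling the matcher returns (match_id, start, end) token spans. A hedged usage sketch (assumes a loaded nlp pipeline; the key and pattern texts are illustrative):

    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(u'Barack Obama'), nlp(u'Angela Merkel')]   # Doc objects as patterns

    def on_match(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        print('Matched:', doc[start:end].text)

    matcher.add('POLITICIAN', on_match, *patterns)
    matches = matcher(nlp(u'Barack Obama visited Berlin.'))
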
|  |  | |||
|  | @ -146,6 +146,8 @@ cdef class Morphology: | |||
|                 self.add_special_case(tag_str, form_str, attrs) | ||||
| 
 | ||||
|     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): | ||||
|         if orth not in self.strings: | ||||
|             return orth | ||||
|         cdef unicode py_string = self.strings[orth] | ||||
|         if self.lemmatizer is None: | ||||
|             return self.strings.add(py_string.lower()) | ||||
|  |  | |||
|  | @ -4,7 +4,6 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from thinc.api import chain, layerize, with_getitem | ||||
| from thinc.neural import Model, Softmax | ||||
| import numpy | ||||
| cimport numpy as np | ||||
| import cytoolz | ||||
|  | @ -14,17 +13,18 @@ import ujson | |||
| import msgpack | ||||
| 
 | ||||
| from thinc.api import add, layerize, chain, clone, concatenate, with_flatten | ||||
| from thinc.neural import Model, Maxout, Softmax, Affine | ||||
| from thinc.neural._classes.hash_embed import HashEmbed | ||||
| from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||
| from thinc.i2v import HashEmbed | ||||
| from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool | ||||
| from thinc.t2t import ExtractWindow, ParametricAttention | ||||
| from thinc.misc import Residual | ||||
| from thinc.misc import BatchNorm as BN | ||||
| from thinc.misc import LayerNorm as LN | ||||
| 
 | ||||
| from thinc.neural.util import to_categorical | ||||
| 
 | ||||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool | ||||
| from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||
| 
 | ||||
| from thinc.neural._classes.convolution import ExtractWindow | ||||
| from thinc.neural._classes.resnet import Residual | ||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | ||||
| 
 | ||||
| from .tokens.doc cimport Doc | ||||
| from .syntax.parser cimport Parser as LinearParser | ||||
| from .syntax.nn_parser cimport Parser as NeuralParser | ||||
|  | @ -41,13 +41,14 @@ from .syntax import nonproj | |||
| from .compat import json_dumps | ||||
| 
 | ||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | ||||
| from ._ml import rebatch, Tok2Vec, flatten | ||||
| from ._ml import build_text_classifier, build_tagger_model | ||||
| from ._ml import link_vectors_to_models | ||||
| from .parts_of_speech import X | ||||
| 
 | ||||
| 
 | ||||
| class SentenceSegmenter(object): | ||||
|     '''A simple spaCy hook, to allow custom sentence boundary detection logic | ||||
|     """A simple spaCy hook, to allow custom sentence boundary detection logic | ||||
|     (that doesn't require the dependency parse). | ||||
| 
 | ||||
|     To change the sentence boundary detection strategy, pass a generator | ||||
|  | @ -56,7 +57,7 @@ class SentenceSegmenter(object): | |||
| 
 | ||||
|     Sentence detection strategies should be generators that take `Doc` objects | ||||
|     and yield `Span` objects for each sentence. | ||||
|     ''' | ||||
|     """ | ||||
|     name = 'sbd' | ||||
| 
 | ||||
|     def __init__(self, vocab, strategy=None): | ||||
|  | @ -88,17 +89,30 @@ class BaseThincComponent(object): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, *shape, **kwargs): | ||||
|         """Initialize a model for the pipe.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def __init__(self, vocab, model=True, **cfg): | ||||
|         """Create a new pipe instance.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Apply the pipe to one document. The document is | ||||
|         modified in-place, and returned. | ||||
| 
 | ||||
|         Both __call__ and pipe should delegate to the `predict()` | ||||
|         and `set_annotations()` methods. | ||||
|         """ | ||||
|         scores = self.predict([doc]) | ||||
|         self.set_annotations([doc], scores) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||
|         """Apply the pipe to a stream of documents. | ||||
| 
 | ||||
|         Both __call__ and pipe should delegate to the `predict()` | ||||
|         and `set_annotations()` methods. | ||||
|         """ | ||||
|         for docs in cytoolz.partition_all(batch_size, stream): | ||||
|             docs = list(docs) | ||||
|             scores = self.predict(docs) | ||||
|  | @ -106,27 +120,43 @@ class BaseThincComponent(object): | |||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|         """Apply the pipeline's model to a batch of docs, without | ||||
|         modifying them. | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def set_annotations(self, docs, scores): | ||||
|         """Modify a batch of documents, using pre-computed scores.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         """Learn from a batch of documents and gold-standard information, | ||||
|         updating the pipe's model. | ||||
| 
 | ||||
|         Delegates to predict() and get_loss(). | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         """Find the loss and gradient of loss for the batch of | ||||
|         documents and their predicted scores.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||
|         token_vector_width = pipeline[0].model.nO | ||||
|         """Initialize the pipe for training, using data exampes if available. | ||||
|         If no model has been initialized yet, the model is added.""" | ||||
|         if self.model is True: | ||||
|             self.model = self.Model(1, token_vector_width) | ||||
|             self.model = self.Model(**self.cfg) | ||||
|         link_vectors_to_models(self.vocab) | ||||
| 
 | ||||
|     def use_params(self, params): | ||||
|         """Modify the pipe's model, to use the given parameter values. | ||||
|         """ | ||||
|         with self.model.use_params(params): | ||||
|             yield | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         """Serialize the pipe to a bytestring.""" | ||||
|         serialize = OrderedDict(( | ||||
|             ('cfg', lambda: json_dumps(self.cfg)), | ||||
|             ('model', lambda: self.model.to_bytes()), | ||||
|  | @ -135,37 +165,42 @@ class BaseThincComponent(object): | |||
|         return util.to_bytes(serialize, exclude) | ||||
| 
 | ||||
|     def from_bytes(self, bytes_data, **exclude): | ||||
|         """Load the pipe from a bytestring.""" | ||||
|         def load_model(b): | ||||
|             if self.model is True: | ||||
|                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|                 self.model = self.Model(**self.cfg) | ||||
|             self.model.from_bytes(b) | ||||
| 
 | ||||
|         deserialize = OrderedDict(( | ||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('model', load_model), | ||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)) | ||||
|         )) | ||||
|         util.from_bytes(bytes_data, deserialize, exclude) | ||||
|         return self | ||||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         """Serialize the pipe to disk.""" | ||||
|         serialize = OrderedDict(( | ||||
|             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||
|             ('model', lambda p: p.open('wb').write(self.model.to_bytes())), | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)) | ||||
|         )) | ||||
|         util.to_disk(path, serialize, exclude) | ||||
| 
 | ||||
|     def from_disk(self, path, **exclude): | ||||
|         """Load the pipe from disk.""" | ||||
|         def load_model(p): | ||||
|             if self.model is True: | ||||
|                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|                 self.model = self.Model(**self.cfg) | ||||
|             self.model.from_bytes(p.open('rb').read()) | ||||
| 
 | ||||
|         deserialize = OrderedDict(( | ||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||
|             ('model', load_model), | ||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||
|             ('model', load_model), | ||||
|         )) | ||||
|         util.from_disk(path, deserialize, exclude) | ||||
|         return self | ||||
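
The docstrings above spell out the component contract: __call__ and pipe() delegate to predict() and set_annotations(), and serialization runs through the cfg/model/vocab writers. A hedged sketch of a minimal component that follows the prediction half of that contract (the "model" is faked so the example stays self-contained; results are stashed in doc.user_data purely for illustration):

    class LengthTagger(BaseThincComponent):
        """Toy component that 'predicts' the character length of each token."""
        name = 'length_tagger'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def predict(self, docs):
            # No real statistical model here -- just a stand-in score per token.
            return [[len(token) for token in doc] for doc in docs]

        def set_annotations(self, docs, scores):
            for doc, doc_scores in zip(docs, scores):
                doc.user_data['token_lengths'] = doc_scores

    # __call__ and pipe() are inherited and already delegate to the two methods above.
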
|  | @ -193,7 +228,7 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|         """ | ||||
|         width = util.env_opt('token_vector_width', width) | ||||
|         embed_size = util.env_opt('embed_size', embed_size) | ||||
|         return Tok2Vec(width, embed_size, preprocess=None) | ||||
|         return Tok2Vec(width, embed_size, **cfg) | ||||
| 
 | ||||
|     def __init__(self, vocab, model=True, **cfg): | ||||
|         """Construct a new statistical model. Weights are not allocated on | ||||
|  | @ -210,9 +245,10 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|             >>> tok2vec.model = tok2vec.Model(128, 5000) | ||||
|         """ | ||||
|         self.vocab = vocab | ||||
|         self.doc2feats = doc2feats() | ||||
|         self.model = model | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 3) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM | ||||
|  | @ -245,8 +281,7 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|         docs (iterable): A sequence of `Doc` objects. | ||||
|         RETURNS (object): Vector representations for each token in the documents. | ||||
|         """ | ||||
|         feats = self.doc2feats(docs) | ||||
|         tokvecs = self.model(feats) | ||||
|         tokvecs = self.model(docs) | ||||
|         return tokvecs | ||||
| 
 | ||||
|     def set_annotations(self, docs, tokvecses): | ||||
|  | @ -270,8 +305,7 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|         """ | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         feats = self.doc2feats(docs) | ||||
|         tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) | ||||
|         tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop) | ||||
|         return tokvecs, bp_tokvecs | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|  | @ -285,9 +319,10 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|         gold_tuples (iterable): Gold-standard training data. | ||||
|         pipeline (list): The pipeline the model is part of. | ||||
|         """ | ||||
|         self.doc2feats = doc2feats() | ||||
|         if self.model is True: | ||||
|             self.model = self.Model() | ||||
|             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|             self.model = self.Model(**self.cfg) | ||||
|         link_vectors_to_models(self.vocab) | ||||
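
With doc2feats gone, the encoder's model consumes Doc objects directly. A hedged usage sketch (assumes an existing nlp object; set_annotations is assumed to write the vectors onto doc.tensor):

    tok2vec = TokenVectorEncoder(nlp.vocab)
    tok2vec.begin_training()                   # allocates the model from the stored cfg
    docs = [nlp.make_doc(u'A short example.')]
    tokvecs = tok2vec.predict(docs)            # Doc objects in, one array per doc out
    tok2vec.set_annotations(docs, tokvecs)
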
| 
 | ||||
| 
 | ||||
| class NeuralTagger(BaseThincComponent): | ||||
|  | @ -296,29 +331,29 @@ class NeuralTagger(BaseThincComponent): | |||
|         self.vocab = vocab | ||||
|         self.model = model | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
|         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         tags = self.predict(([doc], [doc.tensor])) | ||||
|         tags = self.predict([doc]) | ||||
|         self.set_annotations([doc], tags) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||
|         for docs in cytoolz.partition_all(batch_size, stream): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [d.tensor for d in docs] | ||||
|             tag_ids = self.predict((docs, tokvecs)) | ||||
|             tag_ids = self.predict(docs) | ||||
|             self.set_annotations(docs, tag_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs_tokvecs): | ||||
|         scores = self.model(docs_tokvecs) | ||||
|     def predict(self, docs): | ||||
|         scores = self.model(docs) | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         if not isinstance(guesses, numpy.ndarray): | ||||
|             guesses = guesses.get() | ||||
|         tokvecs = docs_tokvecs[1] | ||||
|         guesses = self.model.ops.unflatten(guesses, | ||||
|                     [tv.shape[0] for tv in tokvecs]) | ||||
|                     [len(d) for d in docs]) | ||||
|         return guesses | ||||
| 
 | ||||
|     def set_annotations(self, docs, batch_tag_ids): | ||||
|  | @ -338,20 +373,16 @@ class NeuralTagger(BaseThincComponent): | |||
|                 idx += 1 | ||||
|         doc.is_tagged = True | ||||
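
The tagger now flattens the per-doc score matrices into one array, takes the argmax, and unflattens the guesses back by document length. A numpy-only sketch of that round trip (standing in for thinc's ops.flatten/ops.unflatten; the shapes are made up):

    import numpy

    per_doc = [numpy.zeros((3, 5)), numpy.zeros((7, 5))]    # scores for two docs
    flat = numpy.concatenate(per_doc)                        # what ops.flatten does
    guesses = flat.argmax(axis=1)                            # one tag id per token
    lengths = [len(a) for a in per_doc]
    per_doc_guesses = numpy.split(guesses, numpy.cumsum(lengths)[:-1])  # ops.unflatten
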
| 
 | ||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         docs, tokvecs = docs_tokvecs | ||||
| 
 | ||||
|         if self.model.nI is None: | ||||
|             self.model.nI = tokvecs[0].shape[1] | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop) | ||||
|         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) | ||||
|         bp_tag_scores(d_tag_scores, sgd=sgd) | ||||
| 
 | ||||
|         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) | ||||
|         if losses is not None: | ||||
|             losses[self.name] += loss | ||||
|         return d_tokvecs | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|  | @ -392,13 +423,14 @@ class NeuralTagger(BaseThincComponent): | |||
|             vocab.morphology = Morphology(vocab.strings, new_tag_map, | ||||
|                                           vocab.morphology.lemmatizer, | ||||
|                                           exc=vocab.morphology.exc) | ||||
|         token_vector_width = pipeline[0].model.nO | ||||
|         if self.model is True: | ||||
|             self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) | ||||
|             self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||
|             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|         link_vectors_to_models(self.vocab) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|     def Model(cls, n_tags, **cfg): | ||||
|         return build_tagger_model(n_tags, **cfg) | ||||
| 
 | ||||
|     def use_params(self, params): | ||||
|         with self.model.use_params(params): | ||||
|  | @ -419,7 +451,7 @@ class NeuralTagger(BaseThincComponent): | |||
|             if self.model is True: | ||||
|                 token_vector_width = util.env_opt('token_vector_width', | ||||
|                         self.cfg.get('token_vector_width', 128)) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|             self.model.from_bytes(b) | ||||
| 
 | ||||
|         def load_tag_map(b): | ||||
|  | @ -438,6 +470,7 @@ class NeuralTagger(BaseThincComponent): | |||
|         return self | ||||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||
|         serialize = OrderedDict(( | ||||
|             ('vocab', lambda p: self.vocab.to_disk(p)), | ||||
|             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps( | ||||
|  | @ -452,9 +485,7 @@ class NeuralTagger(BaseThincComponent): | |||
|     def from_disk(self, path, **exclude): | ||||
|         def load_model(p): | ||||
|             if self.model is True: | ||||
|                 token_vector_width = util.env_opt('token_vector_width', | ||||
|                         self.cfg.get('token_vector_width', 128)) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) | ||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg) | ||||
|             self.model.from_bytes(p.open('rb').read()) | ||||
| 
 | ||||
|         def load_tag_map(p): | ||||
|  | @ -466,10 +497,10 @@ class NeuralTagger(BaseThincComponent): | |||
|                 exc=self.vocab.morphology.exc) | ||||
| 
 | ||||
|         deserialize = OrderedDict(( | ||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))), | ||||
|             ('vocab', lambda p: self.vocab.from_disk(p)), | ||||
|             ('tag_map', load_tag_map), | ||||
|             ('model', load_model), | ||||
|             ('cfg', lambda p: self.cfg.update(_load_cfg(p))) | ||||
|         )) | ||||
|         util.from_disk(path, deserialize, exclude) | ||||
|         return self | ||||
|  | @ -477,10 +508,28 @@ class NeuralTagger(BaseThincComponent): | |||
| 
 | ||||
| class NeuralLabeller(NeuralTagger): | ||||
|     name = 'nn_labeller' | ||||
|     def __init__(self, vocab, model=True, **cfg): | ||||
|     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): | ||||
|         self.vocab = vocab | ||||
|         self.model = model | ||||
|         if target == 'dep': | ||||
|             self.make_label = self.make_dep | ||||
|         elif target == 'tag': | ||||
|             self.make_label = self.make_tag | ||||
|         elif target == 'ent': | ||||
|             self.make_label = self.make_ent | ||||
|         elif target == 'dep_tag_offset': | ||||
|             self.make_label = self.make_dep_tag_offset | ||||
|         elif target == 'ent_tag': | ||||
|             self.make_label = self.make_ent_tag | ||||
|         elif hasattr(target, '__call__'): | ||||
|             self.make_label = target | ||||
|         else: | ||||
|             raise ValueError( | ||||
|                 "NeuralLabeller target should be function or one of " | ||||
|                 "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']") | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
|         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1]) | ||||
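
The target argument selects which per-token label function drives this auxiliary objective; any callable with the (i, words, tags, heads, deps, ents) signature is also accepted. A hedged construction sketch (nlp is assumed to exist; the custom function is illustrative):

    # Built-in target by name:
    labeller = NeuralLabeller(nlp.vocab, target='dep_tag_offset')

    # Or a custom label function with the same signature as the make_* helpers below:
    def coarse_dep(i, words, tags, heads, deps, ents):
        return None if deps[i] is None else deps[i].split(':')[0]

    labeller = NeuralLabeller(nlp.vocab, target=coarse_dep)
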
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|  | @ -493,41 +542,79 @@ class NeuralLabeller(NeuralTagger): | |||
|     def set_annotations(self, docs, dep_ids): | ||||
|         pass | ||||
| 
 | ||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None): | ||||
|     def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None): | ||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||
|         for raw_text, annots_brackets in gold_tuples: | ||||
|             for annots, brackets in annots_brackets: | ||||
|                 ids, words, tags, heads, deps, ents = annots | ||||
|                 for dep in deps: | ||||
|                     if dep not in self.labels: | ||||
|                         self.labels[dep] = len(self.labels) | ||||
|         token_vector_width = pipeline[0].model.nO | ||||
|                 for i in range(len(ids)): | ||||
|                     label = self.make_label(i, words, tags, heads, deps, ents) | ||||
|                     if label is not None and label not in self.labels: | ||||
|                         self.labels[label] = len(self.labels) | ||||
|         print(len(self.labels)) | ||||
|         if self.model is True: | ||||
|             self.model = self.Model(len(self.labels), token_vector_width) | ||||
|             token_vector_width = util.env_opt('token_vector_width') | ||||
|             self.model = chain( | ||||
|                 tok2vec, | ||||
|                 Softmax(len(self.labels), token_vector_width) | ||||
|             ) | ||||
|         link_vectors_to_models(self.vocab) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|     def Model(cls, n_tags, tok2vec=None, **cfg): | ||||
|         return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg) | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         for gold in golds: | ||||
|             for tag in gold.labels: | ||||
|                 if tag is None or tag not in self.labels: | ||||
|             for i in range(len(gold.labels)): | ||||
|                 label = self.make_label(i, gold.words, gold.tags, gold.heads, | ||||
|                                         gold.labels, gold.ents) | ||||
|                 if label is None or label not in self.labels: | ||||
|                     correct[idx] = guesses[idx] | ||||
|                 else: | ||||
|                     correct[idx] = self.labels[tag] | ||||
|                     correct[idx] = self.labels[label] | ||||
|                 idx += 1 | ||||
|         correct = self.model.ops.xp.array(correct, dtype='i') | ||||
|         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) | ||||
|         d_scores /= d_scores.shape[0] | ||||
|         loss = (d_scores**2).sum() | ||||
|         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) | ||||
|         return float(loss), d_scores | ||||
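
The gradient above is the flattened scores minus one-hot targets, scaled by the number of tokens, with the loss reported as the sum of squared gradient entries. A numpy stand-in for that computation (to_one_hot replaces thinc's to_categorical; the fallback for unknown labels is omitted here):

    import numpy

    def to_one_hot(ids, n_classes):
        out = numpy.zeros((len(ids), n_classes), dtype='f')
        out[numpy.arange(len(ids)), ids] = 1.0
        return out

    scores = numpy.array([[0.7, 0.2, 0.1],
                          [0.1, 0.8, 0.1]], dtype='f')
    correct = numpy.array([1, 0])                       # gold class ids per token
    d_scores = scores - to_one_hot(correct, scores.shape[1])
    d_scores /= d_scores.shape[0]                       # scale by number of tokens
    loss = (d_scores ** 2).sum()
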
| 
 | ||||
|     @staticmethod | ||||
|     def make_dep(i, words, tags, heads, deps, ents): | ||||
|         if deps[i] is None or heads[i] is None: | ||||
|             return None | ||||
|         return deps[i] | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_tag(i, words, tags, heads, deps, ents): | ||||
|         return tags[i] | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_ent(i, words, tags, heads, deps, ents): | ||||
|         if ents is None: | ||||
|             return None | ||||
|         return ents[i] | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_dep_tag_offset(i, words, tags, heads, deps, ents): | ||||
|         if deps[i] is None or heads[i] is None: | ||||
|             return None | ||||
|         offset = heads[i] - i | ||||
|         offset = min(offset, 2) | ||||
|         offset = max(offset, -2) | ||||
|         return '%s-%s:%d' % (deps[i], tags[i], offset) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def make_ent_tag(i, words, tags, heads, deps, ents): | ||||
|         if ents is None or ents[i] is None: | ||||
|             return None | ||||
|         else: | ||||
|             return '%s-%s' % (tags[i], ents[i]) | ||||
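
These static helpers define what the auxiliary objective predicts per token; make_dep_tag_offset, for example, clamps the head offset to [-2, 2] and fuses it with the dep label and tag. A small worked sketch with toy annotations:

    words = ['She', 'sells', 'shells']
    tags  = ['PRP', 'VBZ', 'NNS']
    heads = [1, 1, 1]                  # every token attaches to 'sells'
    deps  = ['nsubj', 'ROOT', 'dobj']
    ents  = None

    print(NeuralLabeller.make_dep_tag_offset(0, words, tags, heads, deps, ents))
    # 'nsubj-PRP:1'  (head offset 1 - 0 = 1, already inside the [-2, 2] clamp)
    print(NeuralLabeller.make_ent_tag(0, words, tags, heads, deps, ents))
    # None, because no entity annotation is available
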
| 
 | ||||
| 
 | ||||
| class SimilarityHook(BaseThincComponent): | ||||
|     """ | ||||
|  | @ -555,7 +642,7 @@ class SimilarityHook(BaseThincComponent): | |||
|         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         '''Install similarity hook''' | ||||
|         """Install similarity hook""" | ||||
|         doc.user_hooks['similarity'] = self.predict | ||||
|         return doc | ||||
| 
 | ||||
|  | @ -564,15 +651,10 @@ class SimilarityHook(BaseThincComponent): | |||
|             yield self(doc) | ||||
| 
 | ||||
|     def predict(self, doc1, doc2): | ||||
|         return self.model.predict([(doc1.tensor, doc2.tensor)]) | ||||
|         return self.model.predict([(doc1, doc2)]) | ||||
| 
 | ||||
|     def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.): | ||||
|         doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2 | ||||
|         sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s), | ||||
|                                                 drop=drop) | ||||
|         d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd) | ||||
| 
 | ||||
|         return d_tensor1s, d_tensor2s | ||||
|     def update(self, doc1_doc2, golds, sgd=None, drop=0.): | ||||
|         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) | ||||
| 
 | ||||
|     def begin_training(self, _=tuple(), pipeline=None): | ||||
|         """ | ||||
|  | @ -583,6 +665,7 @@ class SimilarityHook(BaseThincComponent): | |||
|         """ | ||||
|         if self.model is True: | ||||
|             self.model = self.Model(pipeline[0].model.nO) | ||||
|             link_vectors_to_models(self.vocab) | ||||
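
Once installed on a Doc, the hook reroutes similarity calls through the trained Siamese model via doc.user_hooks. A hedged usage sketch (assumes the usual (vocab, model=True, **cfg) constructor and an nlp pipeline whose first component exposes model.nO for begin_training):

    hook = SimilarityHook(nlp.vocab)
    hook.begin_training(pipeline=nlp.pipeline)
    doc1, doc2 = nlp(u'a cat'), nlp(u'a dog')
    hook(doc1)                          # installs doc1.user_hooks['similarity']
    score = doc1.similarity(doc2)       # now answered by the Siamese model
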
| 
 | ||||
| 
 | ||||
| class TextCategorizer(BaseThincComponent): | ||||
|  | @ -627,15 +710,13 @@ class TextCategorizer(BaseThincComponent): | |||
|             for j, label in enumerate(self.labels): | ||||
|                 doc.cats[label] = float(scores[i, j]) | ||||
| 
 | ||||
|     def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|         docs, tensors = docs_tensors | ||||
|     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|         scores, bp_scores = self.model.begin_update(docs, drop=drop) | ||||
|         loss, d_scores = self.get_loss(docs, golds, scores) | ||||
|         d_tensors = bp_scores(d_scores, sgd=sgd) | ||||
|         bp_scores(d_scores, sgd=sgd) | ||||
|         if losses is not None: | ||||
|             losses.setdefault(self.name, 0.0) | ||||
|             losses[self.name] += loss | ||||
|         return d_tensors | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') | ||||
|  | @ -653,8 +734,10 @@ class TextCategorizer(BaseThincComponent): | |||
|         else: | ||||
|             token_vector_width = 64 | ||||
|         if self.model is True: | ||||
|             self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|             self.model = self.Model(len(self.labels), token_vector_width, | ||||
|                                     **self.cfg) | ||||
|             link_vectors_to_models(self.vocab) | ||||
| 
 | ||||
| 
 | ||||
| cdef class EntityRecognizer(LinearParser): | ||||
|  | @ -695,6 +778,14 @@ cdef class NeuralDependencyParser(NeuralParser): | |||
|     name = 'parser' | ||||
|     TransitionSystem = ArcEager | ||||
| 
 | ||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||
|         for target in []: | ||||
|             labeller = NeuralLabeller(self.vocab, target=target) | ||||
|             tok2vec = self.model[0] | ||||
|             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||
|             pipeline.append(labeller) | ||||
|             self._multitasks.append(labeller) | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) | ||||
| 
 | ||||
|  | @ -705,13 +796,13 @@ cdef class NeuralEntityRecognizer(NeuralParser): | |||
| 
 | ||||
|     nr_feature = 6 | ||||
| 
 | ||||
|     def predict_confidences(self, docs): | ||||
|         tensors = [d.tensor for d in docs] | ||||
|         samples = [] | ||||
|         for i in range(10): | ||||
|             states = self.parse_batch(docs, tensors, drop=0.3) | ||||
|             for state in states: | ||||
|                 samples.append(self._get_entities(state)) | ||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||
|         for target in []: | ||||
|             labeller = NeuralLabeller(self.vocab, target=target) | ||||
|             tok2vec = self.model[0] | ||||
|             labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec) | ||||
|             pipeline.append(labeller) | ||||
|             self._multitasks.append(labeller) | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| cpdef enum symbol_t: | ||||
| cdef enum symbol_t: | ||||
|     NIL | ||||
|     IS_ALPHA | ||||
|     IS_ASCII | ||||
|  |  | |||
|  | @ -1,4 +1,6 @@ | |||
| # coding: utf8 | ||||
| #cython: optimize.unpack_method_calls=False | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| IDS = { | ||||
|  | @ -458,4 +460,11 @@ IDS = { | |||
|     "xcomp": xcomp | ||||
| } | ||||
| 
 | ||||
| NAMES = [it[0] for it in sorted(IDS.items(), key=lambda it: it[1])] | ||||
| def sort_nums(x): | ||||
|     return x[1] | ||||
| 
 | ||||
| NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] | ||||
| # Unfortunate hack here, to work around a problem with the long cpdef enum | ||||
| # (which generates an enormous amount of C++ in Cython 0.24+). | ||||
| # We keep the enum as a plain cdef, and just make sure the names are available to Python. | ||||
| locals().update(IDS) | ||||
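
With the enum now cdef-only, Python code reaches the symbol values through IDS/NAMES, and the locals().update(IDS) line keeps module-level names like xcomp importable. A quick sketch of the mapping (the NAMES lookup assumes symbol ids stay contiguous from 0):

    from spacy import symbols
    from spacy.symbols import IDS, NAMES

    dep_id = IDS['xcomp']             # string -> integer symbol id
    print(symbols.xcomp == dep_id)    # True: module attribute from locals().update(IDS)
    print(NAMES[dep_id])              # 'xcomp', under the contiguous-ids assumption
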
|  |  | |||
|  | @ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens): | |||
| 
 | ||||
| nr_update = 0 | ||||
| def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | ||||
|                 states, tokvecs, golds, | ||||
|                 states, golds, | ||||
|                 state2vec, vec2scores,  | ||||
|                 int width, float density, | ||||
|                 sgd=None, losses=None, drop=0.): | ||||
|                 losses=None, drop=0.): | ||||
|     global nr_update | ||||
|     cdef MaxViolation violn | ||||
|     nr_update += 1 | ||||
|  |  | |||
|  | @ -101,9 +101,10 @@ cdef cppclass StateC: | |||
|         elif n == 6: | ||||
|             if this.B(0) >= 0: | ||||
|                 ids[0] = this.B(0) | ||||
|                 ids[1] = this.B(0)-1 | ||||
|             else: | ||||
|                 ids[0] = -1 | ||||
|             ids[1] = this.B(0) | ||||
|                 ids[1] = -1 | ||||
|             ids[2] = this.B(1) | ||||
|             ids[3] = this.E(0) | ||||
|             if ids[3] >= 1: | ||||
|  | @ -120,6 +121,8 @@ cdef cppclass StateC: | |||
|         for i in range(n): | ||||
|             if ids[i] >= 0: | ||||
|                 ids[i] += this.offset | ||||
|             else: | ||||
|                 ids[i] = -1 | ||||
| 
 | ||||
|     int S(int i) nogil const: | ||||
|         if i >= this._s_i: | ||||
|  | @ -162,9 +165,9 @@ cdef cppclass StateC: | |||
| 
 | ||||
|     int E(int i) nogil const: | ||||
|         if this._e_i <= 0 or this._e_i >= this.length: | ||||
|             return 0 | ||||
|             return -1 | ||||
|         if i < 0 or i >= this._e_i: | ||||
|             return 0 | ||||
|             return -1 | ||||
|         return this._ents[this._e_i - (i+1)].start | ||||
| 
 | ||||
|     int L(int i, int idx) nogil const: | ||||
|  |  | |||
|  | @ -161,8 +161,7 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|     cdef Transition lookup_transition(self, object name) except *: | ||||
|         cdef attr_t label | ||||
|         if name == '-' or name == None: | ||||
|             move_str = 'M' | ||||
|             label = 0 | ||||
|             return Transition(clas=0, move=MISSING, label=0, score=0) | ||||
|         elif name == '!O': | ||||
|             return Transition(clas=0, move=ISNT, label=0, score=0) | ||||
|         elif '-' in name: | ||||
|  | @ -220,6 +219,31 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             raise Exception(move) | ||||
|         return t | ||||
| 
 | ||||
|     #def add_action(self, int action, label_name): | ||||
|     #    cdef attr_t label_id | ||||
|     #    if not isinstance(label_name, (int, long)): | ||||
|     #        label_id = self.strings.add(label_name) | ||||
|     #    else: | ||||
|     #        label_id = label_name | ||||
|     #    if action == OUT and label_id != 0: | ||||
|     #        return | ||||
|     #    if action == MISSING or action == ISNT: | ||||
|     #        return | ||||
|     #    # Check we're not creating a move we already have, so that this is | ||||
|     #    # idempotent | ||||
|     #    for trans in self.c[:self.n_moves]: | ||||
|     #        if trans.move == action and trans.label == label_id: | ||||
|     #            return 0 | ||||
|     #    if self.n_moves >= self._size: | ||||
|     #        self._size *= 2 | ||||
|     #        self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||
|     #    self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||
|     #    assert self.c[self.n_moves].label == label_id | ||||
|     #    self.n_moves += 1 | ||||
|     #    return 1 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* st) nogil: | ||||
|         # This is especially necessary when we use limited training data. | ||||
|         for i in range(st.length): | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ cdef class Parser: | |||
|     cdef public object model | ||||
|     cdef readonly TransitionSystem moves | ||||
|     cdef readonly object cfg | ||||
|     cdef public object _multitasks | ||||
| 
 | ||||
|     cdef void _parse_step(self, StateC* state, | ||||
|             const float* feat_weights, | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function | |||
| 
 | ||||
| from collections import Counter, OrderedDict | ||||
| import ujson | ||||
| import json | ||||
| import contextlib | ||||
| 
 | ||||
| from libc.math cimport exp | ||||
|  | @ -37,10 +38,9 @@ from preshed.maps cimport MapStruct | |||
| from preshed.maps cimport map_get | ||||
| 
 | ||||
| from thinc.api import layerize, chain, noop, clone, with_flatten | ||||
| from thinc.neural import Model, Affine, ReLu, Maxout | ||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | ||||
| from thinc.neural._classes.selu import SELU | ||||
| from thinc.neural._classes.layernorm import LayerNorm | ||||
| from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU | ||||
| from thinc.misc import LayerNorm | ||||
| 
 | ||||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
|  | @ -48,7 +48,8 @@ from .. import util | |||
| from ..util import get_async, get_cuda_stream | ||||
| from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | ||||
| from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | ||||
| from .._ml import Residual, drop_layer | ||||
| from .._ml import Residual, drop_layer, flatten | ||||
| from .._ml import link_vectors_to_models | ||||
| from ..compat import json_dumps | ||||
| 
 | ||||
| from . import _parse_features | ||||
|  | @ -238,14 +239,15 @@ cdef class Parser: | |||
|     Base class of the DependencyParser and EntityRecognizer. | ||||
|     """ | ||||
|     @classmethod | ||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): | ||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=200, depth=1, **cfg): | ||||
|         depth = util.env_opt('parser_hidden_depth', depth) | ||||
|         token_vector_width = util.env_opt('token_vector_width', token_vector_width) | ||||
|         hidden_width = util.env_opt('hidden_width', hidden_width) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) | ||||
|         embed_size = util.env_opt('embed_size', 4000) | ||||
|         tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, | ||||
|                                     preprocess=doc2feats())) | ||||
|         embed_size = util.env_opt('embed_size', 7000) | ||||
|         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||
|                           pretrained_dims=cfg.get('pretrained_dims', 0)) | ||||
|         tok2vec = chain(tok2vec, flatten) | ||||
|         if parser_maxout_pieces == 1: | ||||
|             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, | ||||
|                         nF=cls.nr_feature, | ||||
|  | @ -262,8 +264,8 @@ cdef class Parser: | |||
|                 upper.is_noop = True | ||||
|             else: | ||||
|                 upper = chain( | ||||
|                     clone(Maxout(hidden_width), (depth-1)), | ||||
|                     zero_init(Affine(nr_class, drop_factor=0.0)) | ||||
|                     clone(Maxout(hidden_width), depth-1), | ||||
|                     zero_init(Affine(nr_class, hidden_width, drop_factor=0.0)) | ||||
|                 ) | ||||
|                 upper.is_noop = False | ||||
|         # TODO: This is an unfortunate hack atm! | ||||
|  | @ -277,7 +279,7 @@ cdef class Parser: | |||
|             'hidden_width': hidden_width, | ||||
|             'maxout_pieces': parser_maxout_pieces | ||||
|         } | ||||
|         return (tensors, lower, upper), cfg | ||||
|         return (tok2vec, lower, upper), cfg | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): | ||||
|         """ | ||||
|  | @ -307,12 +309,16 @@ cdef class Parser: | |||
|             cfg['beam_width'] = util.env_opt('beam_width', 1) | ||||
|         if 'beam_density' not in cfg: | ||||
|             cfg['beam_density'] = util.env_opt('beam_density', 0.0) | ||||
|         if 'pretrained_dims' not in cfg: | ||||
|             cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1] | ||||
|         cfg.setdefault('cnn_maxout_pieces', 3) | ||||
|         self.cfg = cfg | ||||
|         if 'actions' in self.cfg: | ||||
|             for action, labels in self.cfg.get('actions', {}).items(): | ||||
|                 for label in labels: | ||||
|                     self.moves.add_action(action, label) | ||||
|         self.model = model | ||||
|         self._multitasks = [] | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (Parser, (self.vocab, self.moves, self.model), None, None) | ||||
|  | @ -332,11 +338,11 @@ cdef class Parser: | |||
|             beam_density = self.cfg.get('beam_density', 0.0) | ||||
|         cdef Beam beam | ||||
|         if beam_width == 1: | ||||
|             states = self.parse_batch([doc], [doc.tensor]) | ||||
|             states = self.parse_batch([doc]) | ||||
|             self.set_annotations([doc], states) | ||||
|             return doc | ||||
|         else: | ||||
|             beam = self.beam_parse([doc], [doc.tensor], | ||||
|             beam = self.beam_parse([doc], | ||||
|                         beam_width=beam_width, beam_density=beam_density)[0] | ||||
|             output = self.moves.get_beam_annot(beam) | ||||
|             state = <StateClass>beam.at(0) | ||||
|  | @ -365,11 +371,11 @@ cdef class Parser: | |||
|         cdef Beam beam | ||||
|         for docs in cytoolz.partition_all(batch_size, docs): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [doc.tensor for doc in docs] | ||||
|             if beam_width == 1: | ||||
|                 parse_states = self.parse_batch(docs, tokvecs) | ||||
|                 parse_states = self.parse_batch(docs) | ||||
|                 beams = [] | ||||
|             else: | ||||
|                 beams = self.beam_parse(docs, tokvecs, | ||||
|                 beams = self.beam_parse(docs, | ||||
|                             beam_width=beam_width, beam_density=beam_density) | ||||
|                 parse_states = [] | ||||
|                 for beam in beams: | ||||
|  | @ -377,7 +383,7 @@ cdef class Parser: | |||
|             self.set_annotations(docs, parse_states) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def parse_batch(self, docs, tokvecses): | ||||
|     def parse_batch(self, docs): | ||||
|         cdef: | ||||
|             precompute_hiddens state2vec | ||||
|             StateClass state | ||||
|  | @ -388,21 +394,15 @@ cdef class Parser: | |||
|             int nr_class, nr_feat, nr_piece, nr_dim, nr_state | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         if isinstance(tokvecses, np.ndarray): | ||||
|             tokvecses = [tokvecses] | ||||
| 
 | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         if USE_FINE_TUNE: | ||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||
|                                                                             0.0) | ||||
| 
 | ||||
|         nr_state = len(docs) | ||||
|         nr_class = self.moves.n_moves | ||||
|         nr_dim = tokvecs.shape[1] | ||||
|         nr_feat = self.nr_feature | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs, | ||||
|                                                      cuda_stream, 0.0) | ||||
|         nr_piece = state2vec.nP | ||||
| 
 | ||||
|         states = self.moves.init_batch(docs) | ||||
|  | @ -418,21 +418,23 @@ cdef class Parser: | |||
|         c_token_ids = <int*>token_ids.data | ||||
|         c_is_valid = <int*>is_valid.data | ||||
|         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) | ||||
|         cdef int nr_step | ||||
|         while not next_step.empty(): | ||||
|             nr_step = next_step.size() | ||||
|             if not has_hidden: | ||||
|                 for i in cython.parallel.prange( | ||||
|                         next_step.size(), num_threads=6, nogil=True): | ||||
|                 for i in cython.parallel.prange(nr_step, num_threads=6, | ||||
|                                                 nogil=True): | ||||
|                     self._parse_step(next_step[i], | ||||
|                         feat_weights, nr_class, nr_feat, nr_piece) | ||||
|             else: | ||||
|                 for i in range(next_step.size()): | ||||
|                 for i in range(nr_step): | ||||
|                     st = next_step[i] | ||||
|                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||
|                     self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||
|                 vectors = state2vec(token_ids[:next_step.size()]) | ||||
|                 scores = vec2scores(vectors) | ||||
|                 c_scores = <float*>scores.data | ||||
|                 for i in range(next_step.size()): | ||||
|                 for i in range(nr_step): | ||||
|                     st = next_step[i] | ||||
|                     guess = arg_max_if_valid( | ||||
|                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||
|  | @ -445,18 +447,15 @@ cdef class Parser: | |||
|                     next_step.push_back(st) | ||||
|         return states | ||||
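
Stripped of the Cython bookkeeping, parse_batch runs a batched greedy loop: featurize each unfinished state, score the actions, take the best valid one, and drop states as they finish. A hedged pure-Python sketch of that control flow (state.context_tokens and moves.best_valid are hypothetical helpers, not the real C-level API):

    def greedy_parse_sketch(states, state2vec, vec2scores, moves):
        todo = [st for st in states if not st.is_final()]
        while todo:
            token_ids = [st.context_tokens() for st in todo]   # hypothetical helper
            scores = vec2scores(state2vec(token_ids))
            for st, row in zip(todo, scores):
                action = moves.best_valid(st, row)             # argmax over valid actions
                action.apply(st)
            todo = [st for st in todo if not st.is_final()]
        return states
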
| 
 | ||||
|     def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): | ||||
|     def beam_parse(self, docs, int beam_width=3, float beam_density=0.001): | ||||
|         cdef Beam beam | ||||
|         cdef np.ndarray scores | ||||
|         cdef Doc doc | ||||
|         cdef int nr_class = self.moves.n_moves | ||||
|         cdef StateClass stcls, output | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         if USE_FINE_TUNE: | ||||
|             tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, | ||||
|                                                      cuda_stream, 0.0) | ||||
|         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||
|                                                                             0.0) | ||||
|         beams = [] | ||||
|         cdef int offset = 0 | ||||
|         cdef int j = 0 | ||||
|  | @ -516,29 +515,24 @@ cdef class Parser: | |||
|         free(scores) | ||||
|         free(token_ids) | ||||
| 
 | ||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         if not any(self.moves.has_gold(gold) for gold in golds): | ||||
|             return None | ||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: | ||||
|             return self.update_beam(docs_tokvecs, golds, | ||||
|             return self.update_beam(docs, golds, | ||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||
|                     drop=drop, sgd=sgd, losses=losses) | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         docs, tokvec_lists = docs_tokvecs | ||||
|         tokvecs = self.model[0].ops.flatten(tokvec_lists) | ||||
|         if isinstance(docs, Doc) and isinstance(golds, GoldParse): | ||||
|             docs = [docs] | ||||
|             golds = [golds] | ||||
|         if USE_FINE_TUNE: | ||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) | ||||
|             tokvecs = self.model[0].ops.flatten(my_tokvecs) | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
| 
 | ||||
|         states, golds, max_steps = self._init_gold_batch(docs, golds) | ||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, | ||||
|                                                       0.0) | ||||
|         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, | ||||
|                                                                             drop) | ||||
|         todo = [(s, g) for (s, g) in zip(states, golds) | ||||
|                 if not s.is_final() and g is not None] | ||||
|         if not todo: | ||||
|  | @ -582,13 +576,9 @@ cdef class Parser: | |||
|             if n_steps >= max_steps: | ||||
|                 break | ||||
|         self._make_updates(d_tokvecs, | ||||
|             backprops, sgd, cuda_stream) | ||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
|         if USE_FINE_TUNE: | ||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) | ||||
|         return d_tokvecs | ||||
|             bp_tokvecs, backprops, sgd, cuda_stream) | ||||
| 
 | ||||
|     def update_beam(self, docs_tokvecs, golds, width=None, density=None, | ||||
|     def update_beam(self, docs, golds, width=None, density=None, | ||||
|             drop=0., sgd=None, losses=None): | ||||
|         if not any(self.moves.has_gold(gold) for gold in golds): | ||||
|             return None | ||||
|  | @ -600,26 +590,20 @@ cdef class Parser: | |||
|             density = self.cfg.get('beam_density', 0.0) | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         docs, tokvecs = docs_tokvecs | ||||
|         lengths = [len(d) for d in docs] | ||||
|         assert min(lengths) >= 1 | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecs) | ||||
|         if USE_FINE_TUNE: | ||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) | ||||
|             tokvecs += self.model[0].ops.flatten(my_tokvecs) | ||||
| 
 | ||||
|         states = self.moves.init_batch(docs) | ||||
|         for gold in golds: | ||||
|             self.moves.preprocess_gold(gold) | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) | ||||
|         (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop) | ||||
| 
 | ||||
|         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, | ||||
|                                         states, tokvecs, golds, | ||||
|                                         states, golds, | ||||
|                                         state2vec, vec2scores, | ||||
|                                         width, density, | ||||
|                                         sgd=sgd, drop=drop, losses=losses) | ||||
|                                         drop=drop, losses=losses) | ||||
|         backprop_lower = [] | ||||
|         cdef float batch_size = len(docs) | ||||
|         for i, d_scores in enumerate(states_d_scores): | ||||
|  | @ -637,11 +621,7 @@ cdef class Parser: | |||
|             else: | ||||
|                 backprop_lower.append((ids, d_vector, bp_vectors)) | ||||
|         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) | ||||
|         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) | ||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) | ||||
|         if USE_FINE_TUNE: | ||||
|             d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd) | ||||
|         return d_tokvecs | ||||
|         self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) | ||||
| 
 | ||||
|     def _init_gold_batch(self, whole_docs, whole_golds): | ||||
|         """Make a square batch, of length equal to the shortest doc. A long | ||||
|  | @ -679,7 +659,7 @@ cdef class Parser: | |||
|             max_moves = max(max_moves, len(oracle_actions)) | ||||
|         return states, golds, max_moves | ||||
| 
 | ||||
|     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): | ||||
|     def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None): | ||||
|         # Tells CUDA to block, so our async copies complete. | ||||
|         if cuda_stream is not None: | ||||
|             cuda_stream.synchronize() | ||||
|  | @ -690,6 +670,7 @@ cdef class Parser: | |||
|             d_state_features *= mask.reshape(ids.shape + (1,)) | ||||
|             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, | ||||
|                 d_state_features) | ||||
|         bp_tokvecs(d_tokvecs, sgd=sgd) | ||||
| 
 | ||||
|     @property | ||||
|     def move_names(self): | ||||
|  | @ -699,11 +680,12 @@ cdef class Parser: | |||
|             names.append(name) | ||||
|         return names | ||||
| 
 | ||||
|     def get_batch_model(self, batch_size, tokvecs, stream, dropout): | ||||
|         _, lower, upper = self.model | ||||
|         state2vec = precompute_hiddens(batch_size, tokvecs, | ||||
|                         lower, stream, drop=dropout) | ||||
|         return state2vec, upper | ||||
|     def get_batch_model(self, docs, stream, dropout): | ||||
|         tok2vec, lower, upper = self.model | ||||
|         tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout) | ||||
|         state2vec = precompute_hiddens(len(docs), tokvecs, | ||||
|                                        lower, stream, drop=0.0) | ||||
|         return (tokvecs, bp_tokvecs), state2vec, upper | ||||
| 
 | ||||
|     nr_feature = 8 | ||||
| 
 | ||||
|  | @ -766,7 +748,7 @@ cdef class Parser: | |||
|                 # order, or the model goes out of synch | ||||
|                 self.cfg.setdefault('extra_labels', []).append(label) | ||||
| 
 | ||||
|     def begin_training(self, gold_tuples, **cfg): | ||||
|     def begin_training(self, gold_tuples, pipeline=None, **cfg): | ||||
|         if 'model' in cfg: | ||||
|             self.model = cfg['model'] | ||||
|         gold_tuples = nonproj.preprocess_training_data(gold_tuples) | ||||
|  | @ -775,9 +757,22 @@ cdef class Parser: | |||
|             for label in labels: | ||||
|                 self.moves.add_action(action, label) | ||||
|         if self.model is True: | ||||
|             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|             self.model, cfg = self.Model(self.moves.n_moves, **cfg) | ||||
|             self.init_multitask_objectives(gold_tuples, pipeline, **cfg) | ||||
|             link_vectors_to_models(self.vocab) | ||||
|             self.cfg.update(cfg) | ||||
| 
 | ||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||
|         '''Set up models for secondary objectives, to benefit from multi-task | ||||
|         learning. This method is intended to be overridden by subclasses. | ||||
| 
 | ||||
|         For instance, the dependency parser can benefit from sharing | ||||
|         an input representation with a label prediction model. These auxiliary | ||||
|         models are discarded after training. | ||||
|         ''' | ||||
|         pass | ||||
| 
 | ||||
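A minimal sketch of how a subclass might override this hook, assuming the Parser class above is importable as spacy.syntax.nn_parser.Parser and using a hypothetical `build_pos_model` helper for the auxiliary objective (nothing in this diff defines such a helper):

    # Hedged outline only: `build_pos_model` is hypothetical, and the import
    # path assumes the cdef Parser class shown above.
    from spacy.syntax.nn_parser import Parser

    class MultiTaskParser(Parser):
        def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
            # Share the parser-owned tok2vec layer (self.model[0]) with a
            # side objective; the auxiliary model is discarded after training.
            self._pos_model = build_pos_model(tok2vec=self.model[0], **cfg)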
|     def preprocess_gold(self, docs_golds): | ||||
|         for doc, gold in docs_golds: | ||||
|             yield doc, gold | ||||
|  | @ -813,6 +808,7 @@ cdef class Parser: | |||
|         if 'model' not in exclude: | ||||
|             path = util.ensure_path(path) | ||||
|             if self.model is True: | ||||
|                 self.cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|                 self.model, cfg = self.Model(**self.cfg) | ||||
|             else: | ||||
|                 cfg = {} | ||||
|  | @ -835,7 +831,7 @@ cdef class Parser: | |||
|             ('upper_model', lambda: self.model[2].to_bytes()), | ||||
|             ('vocab', lambda: self.vocab.to_bytes()), | ||||
|             ('moves', lambda: self.moves.to_bytes(strings=False)), | ||||
|             ('cfg', lambda: ujson.dumps(self.cfg)) | ||||
|             ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) | ||||
|         )) | ||||
|         if 'model' in exclude: | ||||
|             exclude['tok2vec_model'] = True | ||||
|  | @ -848,7 +844,7 @@ cdef class Parser: | |||
|         deserializers = OrderedDict(( | ||||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), | ||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||
|             ('cfg', lambda b: self.cfg.update(json.loads(b))), | ||||
|             ('tok2vec_model', lambda b: None), | ||||
|             ('lower_model', lambda b: None), | ||||
|             ('upper_model', lambda b: None) | ||||
|  | @ -856,9 +852,11 @@ cdef class Parser: | |||
|         msg = util.from_bytes(bytes_data, deserializers, exclude) | ||||
|         if 'model' not in exclude: | ||||
|             if self.model is True: | ||||
|                 self.model, cfg = self.Model(self.moves.n_moves) | ||||
|                 self.model, cfg = self.Model(**self.cfg) | ||||
|                 cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|             else: | ||||
|                 cfg = {} | ||||
|             cfg['pretrained_dims'] = self.vocab.vectors_length | ||||
|             if 'tok2vec_model' in msg: | ||||
|                 self.model[0].from_bytes(msg['tok2vec_model']) | ||||
|             if 'lower_model' in msg: | ||||
|  |  | |||
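Taken together, the parser changes above move the token-to-vector step inside the parser: `update()` and `update_beam()` now take `Doc` objects directly, `get_batch_model()` runs the parser-owned tok2vec layer via `begin_update()`, and `_make_updates()` scatter-adds the per-state gradients into `d_tokvecs` before backpropagating through `bp_tokvecs`. A hedged sketch of the resulting call pattern, assuming `parser`, `docs`, `golds` and an optimizer `sgd` already exist:

    # Callers no longer pre-compute token vectors or handle a returned
    # d_tokvecs; the parser backprops into its own tok2vec layer.
    parser.update(docs, golds, drop=0.2, sgd=sgd)

    # Beam training follows the same pattern.
    parser.update_beam(docs, golds, width=8, density=0.001, sgd=sgd)

The updated parser tests further down show the same pattern with a single (doc, gold) pair.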
|  | @ -148,7 +148,7 @@ cdef class TransitionSystem: | |||
| 
 | ||||
|     def add_action(self, int action, label_name): | ||||
|         cdef attr_t label_id | ||||
|         if not isinstance(label_name, int): | ||||
|         if not isinstance(label_name, (int, long)): | ||||
|             label_id = self.strings.add(label_name) | ||||
|         else: | ||||
|             label_id = label_name | ||||
|  |  | |||
|  | @ -12,7 +12,7 @@ from .. import util | |||
| 
 | ||||
| 
 | ||||
| _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'id', | ||||
|               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] | ||||
|               'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'th', 'xx'] | ||||
| _models = {'en': ['en_core_web_sm'], | ||||
|            'de': ['de_core_news_md'], | ||||
|            'fr': ['fr_depvec_web_lg'], | ||||
|  | @ -108,6 +108,11 @@ def he_tokenizer(): | |||
| def nb_tokenizer(): | ||||
|     return util.get_lang_class('nb').Defaults.create_tokenizer() | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def th_tokenizer(): | ||||
|     pythainlp = pytest.importorskip("pythainlp") | ||||
|     return util.get_lang_class('th').Defaults.create_tokenizer() | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def stringstore(): | ||||
|  |  | |||
|  | @ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): | |||
|     assert len(tokens) == 4 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["blau-rot"]) | ||||
| def test_tokenizer_splits_hyphens(de_tokenizer, text): | ||||
|     tokens = de_tokenizer(text) | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) | ||||
| def test_tokenizer_splits_numeric_range(de_tokenizer, text): | ||||
|     tokens = de_tokenizer(text) | ||||
|  | @ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text): | |||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt']) | ||||
| def test_tokenizer_keeps_hyphens(de_tokenizer, text): | ||||
|     tokens = de_tokenizer(text) | ||||
|     assert len(tokens) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_splits_double_hyphen_infix(de_tokenizer): | ||||
|     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.") | ||||
|     assert len(tokens) == 12 | ||||
|     assert len(tokens) == 10 | ||||
|     assert tokens[0].text == "Viele" | ||||
|     assert tokens[1].text == "Regeln" | ||||
|     assert tokens[2].text == "--" | ||||
|     assert tokens[3].text == "wie" | ||||
|     assert tokens[4].text == "die" | ||||
|     assert tokens[5].text == "Bindestrich" | ||||
|     assert tokens[6].text == "-" | ||||
|     assert tokens[7].text == "Regeln" | ||||
|     assert tokens[8].text == "--" | ||||
|     assert tokens[9].text == "sind" | ||||
|     assert tokens[10].text == "kompliziert" | ||||
|     assert tokens[5].text == "Bindestrich-Regeln" | ||||
|     assert tokens[6].text == "--" | ||||
|     assert tokens[7].text == "sind" | ||||
|     assert tokens[8].text == "kompliziert" | ||||
|  |  | |||
|  | @ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. | |||
|     assert len(tokens) == 109 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text,length', [ | ||||
|     ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1), | ||||
|     ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1), | ||||
|     ("Kraftfahrzeug-Haftpflichtversicherung", 3), | ||||
|     ("Vakuum-Mittelfrequenz-Induktionsofen", 5) | ||||
| @pytest.mark.parametrize('text', [ | ||||
|     "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", | ||||
|     "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", | ||||
|     "Kraftfahrzeug-Haftpflichtversicherung", | ||||
|     "Vakuum-Mittelfrequenz-Induktionsofen" | ||||
|     ]) | ||||
| def test_tokenizer_handles_long_words(de_tokenizer, text, length): | ||||
| def test_tokenizer_handles_long_words(de_tokenizer, text): | ||||
|     tokens = de_tokenizer(text) | ||||
|     assert len(tokens) == length | ||||
|     assert len(tokens) == 1 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text,length', [ | ||||
|  |  | |||
							
								
								
									
new file: spacy/tests/lang/th/__init__.py (0 lines)
new file: spacy/tests/lang/th/test_tokenizer.py (13 lines)
							|  | @ -0,0 +1,13 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| TOKENIZER_TESTS = [ | ||||
|         ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม']) | ||||
| ] | ||||
| 
 | ||||
| @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) | ||||
| def test_thai_tokenizer(th_tokenizer, text, expected_tokens): | ||||
| 	tokens = [token.text for token in th_tokenizer(text)] | ||||
| 	assert tokens == expected_tokens | ||||
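The Thai tests above rely on the optional `pythainlp` package, which the conftest fixture guards with `importorskip`. A small sketch of using the tokenizer outside the test suite, assuming `pythainlp` is installed:

    # Hedged sketch: build the Thai tokenizer the same way the fixture does.
    from spacy import util

    tokenizer = util.get_lang_class('th').Defaults.create_tokenizer()
    print([t.text for t in tokenizer('คุณรักผมไหม')])  # ['คุณ', 'รัก', 'ผม', 'ไหม']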
|  | @ -26,7 +26,7 @@ def arc_eager(vocab): | |||
| 
 | ||||
| @pytest.fixture | ||||
| def tok2vec(): | ||||
|     return Tok2Vec(8, 100, preprocess=doc2feats()) | ||||
|     return Tok2Vec(8, 100) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
|  | @ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc): | |||
|     parser(doc) | ||||
| 
 | ||||
| 
 | ||||
| def test_update_doc(parser, tok2vec, model, doc, gold): | ||||
| def test_update_doc(parser, model, doc, gold): | ||||
|     parser.model = model | ||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) | ||||
|     d_tokvecs = parser.update(([doc], tokvecs), [gold]) | ||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape | ||||
|     def optimize(weights, gradient, key=None): | ||||
|         weights -= 0.001 * gradient | ||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) | ||||
|     assert d_tokvecs[0].sum() == 0. | ||||
|     parser.update([doc], [gold], sgd=optimize) | ||||
| 
 | ||||
| 
 | ||||
| def test_predict_doc_beam(parser, tok2vec, model, doc): | ||||
|     doc.tensor = tok2vec([doc])[0] | ||||
| def test_predict_doc_beam(parser, model, doc): | ||||
|     parser.model = model | ||||
|     parser(doc, beam_width=32, beam_density=0.001) | ||||
|     for word in doc: | ||||
|         print(word.text, word.head, word.dep_) | ||||
| 
 | ||||
| 
 | ||||
| def test_update_doc_beam(parser, tok2vec, model, doc, gold): | ||||
| def test_update_doc_beam(parser, model, doc, gold): | ||||
|     parser.model = model | ||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) | ||||
|     d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) | ||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape | ||||
|     def optimize(weights, gradient, key=None): | ||||
|         weights -= 0.001 * gradient | ||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) | ||||
|     assert d_tokvecs[0].sum() == 0. | ||||
|     parser.update_beam([doc], [gold], sgd=optimize) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
new file: spacy/tests/regression/test_issue1305.py (8 lines)
							|  | @ -0,0 +1,8 @@ | |||
| import pytest | ||||
| 
 | ||||
| @pytest.mark.models('en') | ||||
| def test_issue1305(EN): | ||||
|     '''Test lemmatization of English VBZ''' | ||||
|     assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) | ||||
|     doc = EN(u'This app works well') | ||||
|     assert doc[2].lemma_ == 'work' | ||||
							
								
								
									
new file: spacy/tests/regression/test_issue1380.py (14 lines)
							|  | @ -0,0 +1,14 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| 
 | ||||
| from ...language import Language | ||||
| 
 | ||||
| def test_issue1380_empty_string(): | ||||
|     nlp = Language() | ||||
|     doc = nlp('') | ||||
|     assert len(doc) == 0 | ||||
| 
 | ||||
| @pytest.mark.models('en') | ||||
| def test_issue1380_en(EN): | ||||
|     doc = EN('') | ||||
|     assert len(doc) == 0 | ||||
|  | @ -13,7 +13,10 @@ def test_issue429(EN): | |||
|             return None | ||||
|         spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] | ||||
|         for ent_id, label, span in spans: | ||||
|         span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label]) | ||||
|             span.merge( | ||||
|                 tag=('NNP' if label else span.root.tag_), | ||||
|                 lemma=span.text, | ||||
|                 label='PERSON') | ||||
| 
 | ||||
|     doc = EN('a') | ||||
|     matcher = Matcher(EN.vocab) | ||||
|  |  | |||
|  | @ -11,7 +11,7 @@ import pytest | |||
| def taggers(en_vocab): | ||||
|     tagger1 = Tagger(en_vocab) | ||||
|     tagger2 = Tagger(en_vocab) | ||||
|     tagger1.model = tagger1.Model(8, 8) | ||||
|     tagger1.model = tagger1.Model(8) | ||||
|     tagger2.model = tagger1.model | ||||
|     return (tagger1, tagger2) | ||||
| 
 | ||||
|  |  | |||
|  | @ -6,6 +6,16 @@ from ...strings import StringStore | |||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| def test_string_hash(stringstore): | ||||
|     '''Test that string hashing is stable across platforms''' | ||||
|     ss = stringstore | ||||
|     assert ss.add('apple') == 8566208034543834098 | ||||
|     heart = '\U0001f499' | ||||
|     print(heart) | ||||
|     h = ss.add(heart) | ||||
|     assert h == 11841826740069053588 | ||||
|   | ||||
| 
 | ||||
| def test_stringstore_from_api_docs(stringstore): | ||||
|     apple_hash = stringstore.add('apple') | ||||
|     assert apple_hash == 8566208034543834098 | ||||
|  |  | |||
|  | @ -34,7 +34,6 @@ def test_matcher_from_api_docs(en_vocab): | |||
|     assert len(patterns[0]) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_matcher_from_usage_docs(en_vocab): | ||||
|     text = "Wow 😀 This is really cool! 😂 😂" | ||||
|     doc = get_doc(en_vocab, words=text.split(' ')) | ||||
|  | @ -46,7 +45,8 @@ def test_matcher_from_usage_docs(en_vocab): | |||
|         if doc.vocab.strings[match_id] == 'HAPPY': | ||||
|             doc.sentiment += 0.1 | ||||
|         span = doc[start : end] | ||||
|         token = span.merge(norm='happy emoji') | ||||
|         token = span.merge() | ||||
|         token.vocab[token.text].norm_ = 'happy emoji' | ||||
| 
 | ||||
|     matcher = Matcher(en_vocab) | ||||
|     matcher.add('HAPPY', label_sentiment, *pos_patterns) | ||||
|  | @ -98,11 +98,11 @@ def test_matcher_match_multi(matcher): | |||
|                             (doc.vocab.strings['Java'], 5, 6)] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_matcher_phrase_matcher(en_vocab): | ||||
|     words = ["Google", "Now"] | ||||
|     doc = get_doc(en_vocab, words) | ||||
|     matcher = PhraseMatcher(en_vocab, [doc]) | ||||
|     matcher = PhraseMatcher(en_vocab) | ||||
|     matcher.add('COMPANY', None, doc) | ||||
|     words = ["I", "like", "Google", "Now", "best"] | ||||
|     doc = get_doc(en_vocab, words) | ||||
|     assert len(matcher(doc)) == 1 | ||||
|  |  | |||
|  | @ -9,7 +9,8 @@ from .util import get_doc | |||
| 
 | ||||
| from pathlib import Path | ||||
| import pytest | ||||
| from thinc.neural import Maxout, Softmax | ||||
| from thinc.neural._classes.maxout import Maxout | ||||
| from thinc.neural._classes.softmax import Softmax | ||||
| from thinc.api import chain | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import sys | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
|  | @ -37,9 +38,10 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): | |||
|     tokens = tokenizer(text) | ||||
|     assert len(tokens) == length | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), | ||||
|                                          ('i💙you', 3), ('🤘🤘yay!', 4)]) | ||||
| def test_tokenizer_handles_emoji(tokenizer, text, length): | ||||
|     # These break on narrow unicode builds, e.g. Windows | ||||
|     if sys.maxunicode >= 1114111: | ||||
|         tokens = tokenizer(text) | ||||
|         assert len(tokens) == length | ||||
|  |  | |||
|  | @ -54,7 +54,7 @@ cdef class Doc: | |||
| 
 | ||||
|     cdef public object noun_chunks_iterator | ||||
| 
 | ||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1 | ||||
|     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1 | ||||
| 
 | ||||
|     cpdef np.ndarray to_array(self, object features) | ||||
| 
 | ||||
|  |  | |||
|  | @ -660,7 +660,7 @@ cdef class Doc: | |||
|         """ | ||||
|         with path.open('rb') as file_: | ||||
|             bytes_data = file_.read() | ||||
|         self.from_bytes(bytes_data, **exclude) | ||||
|         return self.from_bytes(bytes_data, **exclude) | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         """Serialize, i.e. export the document contents to a binary string. | ||||
|  |  | |||
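Returning the result of `from_bytes()` means `Doc.from_disk()` now hands back the populated `Doc`, so loading can be chained. A hedged sketch, assuming an existing `nlp` pipeline and a file previously written with `Doc.to_disk()` (the path is illustrative):

    # from_disk() now returns the Doc it populated.
    from pathlib import Path
    from spacy.tokens import Doc

    doc = Doc(nlp.vocab).from_disk(Path('/tmp/example_doc'))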
|  | @ -3,7 +3,7 @@ from __future__ import unicode_literals, print_function | |||
| 
 | ||||
| import os | ||||
| import ujson | ||||
| import pip | ||||
| import pkg_resources | ||||
| import importlib | ||||
| import regex as re | ||||
| from pathlib import Path | ||||
|  | @ -14,6 +14,7 @@ import numpy | |||
| import io | ||||
| import dill | ||||
| from collections import OrderedDict | ||||
| from thinc.neural._classes.model import Model | ||||
| 
 | ||||
| import msgpack | ||||
| import msgpack_numpy | ||||
|  | @ -180,9 +181,10 @@ def is_package(name): | |||
|     name (unicode): Name of package. | ||||
|     RETURNS (bool): True if installed package, False if not. | ||||
|     """ | ||||
|     packages = pip.get_installed_distributions() | ||||
|     name = name.lower()  # compare package name against lowercase name | ||||
|     packages = pkg_resources.working_set.by_key.keys() | ||||
|     for package in packages: | ||||
|         if package.project_name.replace('-', '_') == name: | ||||
|         if package.lower().replace('-', '_') == name: | ||||
|             return True | ||||
|     return False | ||||
| 
 | ||||
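`is_package()` now consults the `pkg_resources` working set instead of pip internals, and lowercases the queried name before comparing, so lookups are case-insensitive. For example:

    # Returns True only if the model is installed as a Python package.
    from spacy import util

    print(util.is_package('en_core_web_sm'))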
|  | @ -193,6 +195,7 @@ def get_package_path(name): | |||
|     name (unicode): Package name. | ||||
|     RETURNS (Path): Path to installed package. | ||||
|     """ | ||||
|     name = name.lower()  # use lowercase version to be safe | ||||
|     # Here we're importing the module just to find it. This is worryingly | ||||
|     # indirect, but it's otherwise very difficult to find the package. | ||||
|     pkg = importlib.import_module(name) | ||||
|  | @ -557,3 +560,17 @@ def minify_html(html): | |||
|     RETURNS (unicode): "Minified" HTML. | ||||
|     """ | ||||
|     return html.strip().replace('    ', '').replace('\n', '') | ||||
| 
 | ||||
| 
 | ||||
| def use_gpu(gpu_id): | ||||
|     try: | ||||
|         import cupy.cuda.device | ||||
|     except ImportError: | ||||
|         return None | ||||
|     from thinc.neural.ops import CupyOps | ||||
|     device = cupy.cuda.device.Device(gpu_id) | ||||
|     device.use() | ||||
|     Model.ops = CupyOps() | ||||
|     Model.Ops = CupyOps | ||||
|     return device | ||||
| 
 | ||||
|  |  | |||
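The new `use_gpu()` helper switches thinc's global `Model.ops` to CuPy and returns the selected device, or `None` when CuPy is not installed, so callers can fall back to the CPU. A hedged sketch of the intended call pattern (call it before building any models so they allocate on the right device):

    from spacy import util

    device = util.use_gpu(0)   # returns None if CuPy is unavailable
    if device is None:
        print('CuPy not installed; staying on CPU')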
|  | @ -6,6 +6,8 @@ import msgpack | |||
| import msgpack_numpy | ||||
| msgpack_numpy.patch() | ||||
| cimport numpy as np | ||||
| from thinc.neural.util import get_array_module | ||||
| from thinc.neural._classes.model import Model | ||||
| 
 | ||||
| from .typedefs cimport attr_t | ||||
| from .strings cimport StringStore | ||||
|  | @ -14,15 +16,29 @@ from .compat import basestring_ | |||
| 
 | ||||
| 
 | ||||
| cdef class Vectors: | ||||
|     '''Store, save and load word vectors.''' | ||||
|     '''Store, save and load word vectors. | ||||
| 
 | ||||
|     Vectors data is kept in the vectors.data attribute, which should be an | ||||
|     instance of numpy.ndarray (for CPU vectors) | ||||
|     or cupy.ndarray (for GPU vectors). | ||||
| 
 | ||||
|     vectors.key2row is a dictionary mapping word hashes to rows | ||||
|     in the vectors.data table. The array `vectors.keys` keeps | ||||
|     the keys in order, such that keys[vectors.key2row[key]] == key. | ||||
|     ''' | ||||
|     cdef public object data | ||||
|     cdef readonly StringStore strings | ||||
|     cdef public object key2row | ||||
|     cdef public object keys | ||||
|     cdef public int i | ||||
| 
 | ||||
|     def __init__(self, strings, data_or_width): | ||||
|     def __init__(self, strings, data_or_width=0): | ||||
|         if isinstance(strings, StringStore): | ||||
|             self.strings = strings | ||||
|         else: | ||||
|             self.strings = StringStore() | ||||
|             for string in strings: | ||||
|                 self.strings.add(string) | ||||
|         if isinstance(data_or_width, int): | ||||
|             self.data = data = numpy.zeros((len(strings), data_or_width), | ||||
|                                            dtype='f') | ||||
|  | @ -37,6 +53,11 @@ cdef class Vectors: | |||
|         return (Vectors, (self.strings, self.data)) | ||||
| 
 | ||||
|     def __getitem__(self, key): | ||||
|         '''Get a vector by key. If key is a string, it is hashed | ||||
|         to an integer ID using the vectors.strings table. | ||||
| 
 | ||||
|         If the integer key is not found in the table, a KeyError is raised. | ||||
|         ''' | ||||
|         if isinstance(key, basestring): | ||||
|             key = self.strings[key] | ||||
|         i = self.key2row[key] | ||||
|  | @ -46,23 +67,30 @@ cdef class Vectors: | |||
|             return self.data[i] | ||||
| 
 | ||||
|     def __setitem__(self, key, vector): | ||||
|         '''Set a vector for the given key. If key is a string, it is hashed | ||||
|         to an integer ID using the vectors.strings table. | ||||
|         ''' | ||||
|         if isinstance(key, basestring): | ||||
|             key = self.strings.add(key) | ||||
|         i = self.key2row[key] | ||||
|         self.data[i] = vector | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         '''Yield vectors from the table.''' | ||||
|         yield from self.data | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         '''Return the number of vectors that have been assigned.''' | ||||
|         return self.i | ||||
| 
 | ||||
|     def __contains__(self, key): | ||||
|         '''Check whether a key has a vector entry in the table.''' | ||||
|         if isinstance(key, basestring_): | ||||
|             key = self.strings[key] | ||||
|         return key in self.key2row | ||||
| 
 | ||||
|     def add(self, key, vector=None): | ||||
|         '''Add a key to the table, optionally setting a vector value as well.''' | ||||
|         if isinstance(key, basestring_): | ||||
|             key = self.strings.add(key) | ||||
|         if key not in self.key2row: | ||||
|  | @ -80,7 +108,9 @@ cdef class Vectors: | |||
|         return i | ||||
| 
 | ||||
|     def items(self): | ||||
|         for i, string in enumerate(self.strings): | ||||
|         '''Iterate over (string key, vector) pairs, in order.''' | ||||
|         for i, key in enumerate(self.keys): | ||||
|             string = self.strings[key] | ||||
|             yield string, self.data[i] | ||||
| 
 | ||||
|     @property | ||||
|  | @ -118,9 +148,14 @@ cdef class Vectors: | |||
|             self.data | ||||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         xp = get_array_module(self.data) | ||||
|         if xp is numpy: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) | ||||
|         else: | ||||
|             save_array = lambda arr, file_: xp.save(file_, arr) | ||||
|         serializers = OrderedDict(( | ||||
|             ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), | ||||
|             ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), | ||||
|             ('vectors', lambda p: save_array(self.data, p.open('wb'))), | ||||
|             ('keys', lambda p: xp.save(p.open('wb'), self.keys)) | ||||
|         )) | ||||
|         return util.to_disk(path, serializers, exclude) | ||||
| 
 | ||||
|  | @ -133,8 +168,9 @@ cdef class Vectors: | |||
|                     self.key2row[key] = i | ||||
| 
 | ||||
|         def load_vectors(path): | ||||
|             xp = Model.ops.xp | ||||
|             if path.exists(): | ||||
|                 self.data = numpy.load(path) | ||||
|                 self.data = xp.load(path) | ||||
| 
 | ||||
|         serializers = OrderedDict(( | ||||
|             ('keys', load_keys), | ||||
|  |  | |||
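With `keys` tracked in insertion order, `Vectors.items()` now pairs each stored key's string with its row in the data table. A hedged sketch of inspecting a pipeline's vector table, assuming `nlp` is a loaded model whose vocab has vectors:

    # items() yields (string key, row) pairs in the order keys were added.
    for string, row in nlp.vocab.vectors.items():
        print(string, row.shape)
        break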
|  | @ -27,6 +27,7 @@ from .vectors import Vectors | |||
| from . import util | ||||
| from . import attrs | ||||
| from . import symbols | ||||
| from ._ml import link_vectors_to_models | ||||
| 
 | ||||
| 
 | ||||
| cdef class Vocab: | ||||
|  | @ -65,7 +66,7 @@ cdef class Vocab: | |||
|                 self.strings.add(name) | ||||
|         self.lex_attr_getters = lex_attr_getters | ||||
|         self.morphology = Morphology(self.strings, tag_map, lemmatizer) | ||||
|         self.vectors = Vectors(self.strings, 300) | ||||
|         self.vectors = Vectors(self.strings) | ||||
| 
 | ||||
|     property lang: | ||||
|         def __get__(self): | ||||
|  | @ -261,7 +262,7 @@ cdef class Vocab: | |||
|         Words can be looked up by string or int ID. | ||||
| 
 | ||||
|         RETURNS: | ||||
|             A word vector. Size and shape determed by the | ||||
|             A word vector. Size and shape determined by the | ||||
|             vocab.vectors instance. Usually, a numpy ndarray | ||||
|             of shape (300,) and dtype float32. | ||||
| 
 | ||||
|  | @ -323,6 +324,7 @@ cdef class Vocab: | |||
|             self.lexemes_from_bytes(file_.read()) | ||||
|         if self.vectors is not None: | ||||
|             self.vectors.from_disk(path, exclude='strings.json') | ||||
|         link_vectors_to_models(self) | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|  | @ -436,6 +438,7 @@ def unpickle_vocab(sstore, morphology, data_dir, | |||
|     vocab.lex_attr_getters = lex_attr_getters | ||||
|     vocab.lexemes_from_bytes(lexemes_data) | ||||
|     vocab.length = length | ||||
|     link_vectors_to_models(vocab) | ||||
|     return vocab | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -17,6 +17,7 @@ fi | |||
| 
 | ||||
| if [ "${VIA}" == "compile" ]; then | ||||
|   pip install -r requirements.txt | ||||
|   python setup.py build_ext --inplace | ||||
|   pip install -e . | ||||
| fi | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,4 +8,5 @@ include _includes/_mixins | |||
|         | does not exist! | ||||
| 
 | ||||
|     h2.c-landing__title.u-heading-3.u-padding-small | ||||
|         a(href="javascript:history.go(-1)") Click here to go back. | ||||
|         +button(false, true, "secondary-light")(href="javascript:history.go(-1)") | ||||
|             |  Click here to go back | ||||
|  |  | |||
|  | @ -3,24 +3,22 @@ | |||
|         "landing": true, | ||||
|         "logos": [ | ||||
|             { | ||||
|                 "quora": [ "https://www.quora.com", 150 ], | ||||
|                 "chartbeat": [ "https://chartbeat.com", 200 ], | ||||
|                 "duedil": [ "https://www.duedil.com", 150 ], | ||||
|                 "stitchfix": [ "https://www.stitchfix.com", 190 ] | ||||
|                 "airbnb": [ "https://www.airbnb.com", 150, 45], | ||||
|                 "quora": [ "https://www.quora.com", 120, 34 ], | ||||
|                 "retriever": [ "https://www.retriever.no", 150, 33 ], | ||||
|                 "stitchfix": [ "https://www.stitchfix.com", 150, 18 ] | ||||
|             }, | ||||
|             { | ||||
|                 "wayblazer": [ "http://wayblazer.com", 200 ], | ||||
|                 "indico": [ "https://indico.io", 150 ], | ||||
|                 "chattermill": [ "https://chattermill.io", 175 ], | ||||
|                 "turi": [ "https://turi.com", 150 ], | ||||
|                 "kip": [ "http://kipthis.com", 70 ] | ||||
|             }, | ||||
|                 "chartbeat": [ "https://chartbeat.com", 180, 25 ], | ||||
|                 "allenai": [ "https://allenai.org", 220, 37 ] | ||||
|             } | ||||
|         ], | ||||
|         "features": [ | ||||
|             { | ||||
|                 "socrata": [ "https://www.socrata.com", 150 ], | ||||
|                 "cytora": [ "http://www.cytora.com", 125 ], | ||||
|                 "signaln": [ "http://signaln.com", 150 ], | ||||
|                 "wonderflow": [ "http://www.wonderflow.co", 200 ], | ||||
|                 "synapsify": [ "http://www.gosynapsify.com", 150 ] | ||||
|                 "thoughtworks": ["https://www.thoughtworks.com/radar/tools", 150, 28], | ||||
|                 "wapo": ["https://www.washingtonpost.com/news/wonk/wp/2016/05/18/googles-new-artificial-intelligence-cant-understand-these-sentences-can-you/", 100, 77], | ||||
|                 "venturebeat": ["https://venturebeat.com/2017/01/27/4-ai-startups-that-analyze-customer-reviews/", 150, 19], | ||||
|                 "microsoft": ["https://www.microsoft.com/developerblog/2016/09/13/training-a-classifier-for-relation-extraction-from-medical-literature/", 130, 28] | ||||
|             } | ||||
|         ] | ||||
|     }, | ||||
|  | @ -34,7 +32,24 @@ | |||
|         "landing": true | ||||
|     }, | ||||
| 
 | ||||
|     "announcement" : { | ||||
|         "title": "Important Announcement" | ||||
|     "styleguide": { | ||||
|         "title": "Styleguide", | ||||
|         "sidebar": { | ||||
|             "Styleguide": { "": "styleguide" }, | ||||
|             "Resources": { | ||||
|                 "Website Source": "https://github.com/explosion/spacy/tree/master/website", | ||||
|                 "Contributing Guide": "https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md" | ||||
|             } | ||||
|         }, | ||||
|         "menu": { | ||||
|             "Introduction": "intro", | ||||
|             "Logo": "logo", | ||||
|             "Colors": "colors", | ||||
|             "Typography": "typography", | ||||
|             "Elements": "elements", | ||||
|             "Components": "components", | ||||
|             "Embeds": "embeds", | ||||
|             "Markup Reference": "markup" | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  |  | |||
|  | @ -11,12 +11,9 @@ | |||
|         "COMPANY": "Explosion AI", | ||||
|         "COMPANY_URL": "https://explosion.ai", | ||||
|         "DEMOS_URL": "https://demos.explosion.ai", | ||||
|         "MODELS_REPO": "explosion/spacy-models", | ||||
| 
 | ||||
|         "SPACY_VERSION": "1.8", | ||||
|         "LATEST_NEWS": { | ||||
|             "url": "https://github.com/explosion/spaCy/releases/tag/v2.0.0-alpha", | ||||
|             "title": "Test spaCy v2.0.0 alpha!" | ||||
|         }, | ||||
|         "SPACY_VERSION": "2.0", | ||||
| 
 | ||||
|         "SOCIAL": { | ||||
|             "twitter": "spacy_io", | ||||
|  | @ -27,25 +24,23 @@ | |||
|         }, | ||||
| 
 | ||||
|         "NAVIGATION": { | ||||
|             "Home": "/", | ||||
|             "Usage": "/docs/usage", | ||||
|             "Reference": "/docs/api", | ||||
|             "Demos": "/docs/usage/showcase", | ||||
|             "Blog": "https://explosion.ai/blog" | ||||
|             "Usage": "/usage", | ||||
|             "Models": "/models", | ||||
|             "API": "/api" | ||||
|         }, | ||||
| 
 | ||||
|         "FOOTER": { | ||||
|             "spaCy": { | ||||
|                 "Usage": "/docs/usage", | ||||
|                 "API Reference": "/docs/api", | ||||
|                 "Tutorials": "/docs/usage/tutorials", | ||||
|                 "Showcase": "/docs/usage/showcase" | ||||
|                 "Usage": "/usage", | ||||
|                 "Models": "/models", | ||||
|                 "API Reference": "/api", | ||||
|                 "Resources": "/usage/resources" | ||||
|             }, | ||||
|             "Support": { | ||||
|                 "Issue Tracker": "https://github.com/explosion/spaCy/issues", | ||||
|                 "StackOverflow": "http://stackoverflow.com/questions/tagged/spacy", | ||||
|                 "Reddit usergroup": "https://www.reddit.com/r/spacynlp/", | ||||
|                 "Gitter chat": "https://gitter.im/explosion/spaCy" | ||||
|                 "Reddit Usergroup": "https://www.reddit.com/r/spacynlp/", | ||||
|                 "Gitter Chat": "https://gitter.im/explosion/spaCy" | ||||
|             }, | ||||
|             "Connect": { | ||||
|                 "Twitter": "https://twitter.com/spacy_io", | ||||
|  | @ -74,21 +69,11 @@ | |||
|                 {"id": "venv", "title": "virtualenv", "help": "Use a virtual environment and install spaCy into a user directory" }, | ||||
|                 {"id": "gpu", "title": "GPU", "help": "Run spaCy on GPU to make it faster. Requires an NVIDIA graphics card with CUDA 2+. See section below for more info."}] | ||||
|             }, | ||||
|             { "id": "model", "title": "Models", "multiple": true, "options": [ | ||||
|                 { "id": "en", "title": "English", "meta": "50MB" }, | ||||
|                 { "id": "de", "title": "German", "meta": "645MB" }, | ||||
|                 { "id": "fr", "title": "French", "meta": "1.33GB" }, | ||||
|                 { "id": "es", "title": "Spanish", "meta": "377MB"}] | ||||
|             } | ||||
|             { "id": "model", "title": "Models", "multiple": true } | ||||
|         ], | ||||
| 
 | ||||
|         "QUICKSTART_MODELS": [ | ||||
|             { "id": "lang", "title": "Language", "options": [ | ||||
|                 { "id": "en", "title": "English", "checked": true }, | ||||
|                 { "id": "de", "title": "German" }, | ||||
|                 { "id": "fr", "title": "French" }, | ||||
|                 { "id": "es", "title": "Spanish" }] | ||||
|             }, | ||||
|             { "id": "lang", "title": "Language"}, | ||||
|             { "id": "load", "title": "Loading style", "options": [ | ||||
|                 { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, | ||||
|                 {  "id": "module", "title": "Import as module", "help": "Import the model explicitly as a Python module." }] | ||||
|  | @ -98,50 +83,15 @@ | |||
|             } | ||||
|         ], | ||||
| 
 | ||||
|         "MODELS": { | ||||
|             "en": [ | ||||
|                 { "id": "en_core_web_sm", "lang": "English", "feats": [1, 1, 1, 1], "size": "50 MB", "license": "CC BY-SA", "def": true }, | ||||
|                 { "id": "en_core_web_md", "lang": "English", "feats": [1, 1, 1, 1], "size": "1 GB", "license": "CC BY-SA" }, | ||||
|                 { "id": "en_depent_web_md", "lang": "English", "feats": [1, 1, 1, 0], "size": "328 MB", "license": "CC BY-SA" }, | ||||
|                 { "id": "en_vectors_glove_md", "lang": "English", "feats": [1, 0, 0, 1], "size": "727 MB", "license": "CC BY-SA" } | ||||
|             ], | ||||
|             "de": [ | ||||
|                 { "id": "de_core_news_md", "lang": "German", "feats": [1, 1, 1, 1], "size": "645 MB", "license": "CC BY-SA" } | ||||
|             ], | ||||
|             "fr": [ | ||||
|                 { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } | ||||
|             ], | ||||
|             "es": [ | ||||
|                 { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} | ||||
|             ] | ||||
|         }, | ||||
| 
 | ||||
|         "EXAMPLE_SENTENCES": { | ||||
|             "en": "This is a sentence.", | ||||
|             "de": "Dies ist ein Satz.", | ||||
|             "fr": "C'est une phrase.", | ||||
|             "es": "Esto es una frase." | ||||
|         }, | ||||
| 
 | ||||
|         "ALPHA": true, | ||||
|         "V_CSS": "1.6", | ||||
|         "V_JS": "1.2", | ||||
|         "V_CSS": "2.0", | ||||
|         "V_JS": "2.0", | ||||
|         "DEFAULT_SYNTAX": "python", | ||||
|         "ANALYTICS": "UA-58931649-1", | ||||
|         "MAILCHIMP": { | ||||
|             "user": "spacy.us12", | ||||
|             "id": "83b0498b1e7fa3c91ce68c3f1", | ||||
|             "list": "89ad33e698" | ||||
|         }, | ||||
|         "BADGES": { | ||||
|             "pipy": { | ||||
|                 "badge": "https://img.shields.io/pypi/v/spacy.svg?style=flat-square", | ||||
|                 "link": "https://pypi.python.org/pypi/spacy" | ||||
|             }, | ||||
|             "conda": { | ||||
|                 "badge": "https://anaconda.org/conda-forge/spacy/badges/version.svg", | ||||
|                 "link": "https://anaconda.org/conda-forge/spacy" | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  |  | |||
|  | @ -1,8 +1,6 @@ | |||
| //- 💫 INCLUDES > FOOTER | ||||
| 
 | ||||
| include _mixins | ||||
| 
 | ||||
| footer.o-footer.u-text.u-border-dotted | ||||
| footer.o-footer.u-text | ||||
|     +grid.o-content | ||||
|         each group, label in FOOTER | ||||
|             +grid-col("quarter") | ||||
|  | @ -13,18 +11,18 @@ footer.o-footer.u-text.u-border-dotted | |||
|                         li | ||||
|                             +a(url)=item | ||||
| 
 | ||||
|         if SECTION != "docs" | ||||
|         if SECTION == "index" | ||||
|             +grid-col("quarter") | ||||
|                 include _newsletter | ||||
| 
 | ||||
|     if SECTION == "docs" | ||||
|     if SECTION != "index" | ||||
|         .o-content.o-block.u-border-dotted | ||||
|             include _newsletter | ||||
| 
 | ||||
|     .o-inline-list.u-text-center.u-text-tiny.u-color-subtle | ||||
|         span © 2016-#{new Date().getFullYear()} #[+a(COMPANY_URL, true)=COMPANY] | ||||
| 
 | ||||
|         +a(COMPANY_URL, true) | ||||
|             +svg("graphics", "explosion", 45).o-icon.u-color-theme.u-grayscale | ||||
|         +a(COMPANY_URL, true)(aria-label="Explosion AI") | ||||
|             +icon("explosion", 45).o-icon.u-color-theme.u-grayscale | ||||
| 
 | ||||
|         +a(COMPANY_URL + "/legal", true) Legal / Imprint | ||||
|  |  | |||
|  | @ -1,35 +1,71 @@ | |||
| //- 💫 INCLUDES > FUNCTIONS | ||||
| 
 | ||||
| //- More descriptive variables for current.path and current.source | ||||
| //- Descriptive variables, available in the global scope | ||||
| 
 | ||||
| - CURRENT = current.source | ||||
| - SECTION = current.path[0] | ||||
| - SUBSECTION = current.path[1] | ||||
| - LANGUAGES = public.models._data.LANGUAGES | ||||
| - MODELS = public.models._data.MODELS | ||||
| - CURRENT_MODELS = MODELS[current.source] || [] | ||||
| 
 | ||||
| - MODEL_COUNT = Object.keys(MODELS).map(m => Object.keys(MODELS[m]).length).reduce((a, b) => a + b) | ||||
| - MODEL_LANG_COUNT = Object.keys(MODELS).length | ||||
| - LANG_COUNT = Object.keys(LANGUAGES).length | ||||
| 
 | ||||
| - MODEL_META = public.models._data.MODEL_META | ||||
| - MODEL_LICENSES = public.models._data.MODEL_LICENSES | ||||
| - MODEL_ACCURACY = public.models._data.MODEL_ACCURACY | ||||
| - EXAMPLE_SENTENCES = public.models._data.EXAMPLE_SENTENCES | ||||
| 
 | ||||
| - IS_PAGE = (SECTION != "index") && !landing | ||||
| - IS_MODELS = (SECTION == "models" && LANGUAGES[current.source]) | ||||
| - HAS_MODELS = IS_MODELS && CURRENT_MODELS.length | ||||
| 
 | ||||
| 
 | ||||
| //- Add prefixes to items of an array (for modifier CSS classes) | ||||
|     array   - [array] list of class names or options, e.g. ["foot"] | ||||
|     prefix  - [string] prefix to add to each class, e.g. "c-table__row" | ||||
|     RETURNS - [string] space-separated list of modified class names | ||||
| 
 | ||||
| -   function prefixArgs(array, prefix) { | ||||
| -       return array.map(function(arg) { | ||||
| -           return prefix + '--' + arg; | ||||
| -       }).join(' '); | ||||
| -       return array.map(arg => prefix + '--' + arg).join(' '); | ||||
| -   } | ||||
| 
 | ||||
| 
 | ||||
| //- Convert API paths (semi-temporary fix for renamed sections) | ||||
|     path    - [string] link path supplied to +api mixin | ||||
|     RETURNS - [string] new link path to correct location | ||||
| 
 | ||||
| -   function convertAPIPath(path) { | ||||
| -       if (path.startsWith('spacy#') || path.startsWith('displacy#') || path.startsWith('util#')) { | ||||
| -           var comps = path.split('#'); | ||||
| -           return "top-level#" + comps[0] + '.' + comps[1]; | ||||
| -       } | ||||
| -       else if (path.startsWith('cli#')) { | ||||
| -           return "top-level#" + path.split('#')[1]; | ||||
| -       } | ||||
| -       return path; | ||||
| -   } | ||||
| 
 | ||||
| 
 | ||||
| //- Get model components from ID. Components can then be looked up in LANGUAGES | ||||
|     and MODEL_META respectively, to get their human-readable form. | ||||
|     id      - [string] model ID, e.g. "en_core_web_sm" | ||||
|     RETURNS - [object] object keyed by components lang, type, genre and size | ||||
| 
 | ||||
| -   function getModelComponents(id) { | ||||
| -       var comps = id.split('_'); | ||||
| -       return {'lang': comps[0], 'type': comps[1], 'genre': comps[2], 'size': comps[3]} | ||||
| -   } | ||||
| 
 | ||||
| 
 | ||||
| //- Generate GitHub links | ||||
|     repo     - [string] name of repo owned by explosion | ||||
|     filepath - [string] logical path to file relative to repository root | ||||
|     branch   - [string] optional branch, defaults to "master" | ||||
|     RETURNS  - [string] the correct link to the file on GitHub | ||||
| 
 | ||||
| -   function gh(repo, filepath, branch) { | ||||
| -       var branch = ALPHA ? 'develop' : branch | ||||
| -       return 'https://github.com/' + SOCIAL.github + '/' + repo + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | ||||
| -   } | ||||
| 
 | ||||
| 
 | ||||
| //- Get social images | ||||
| 
 | ||||
| -   function getSocialImg() { | ||||
| -       var base = SITE_URL + '/assets/img/social/preview_' | ||||
| -       var image = ALPHA ? 'alpha' : 'default' | ||||
| -       if (preview) image = preview | ||||
| -       else if (SECTION == 'docs' && !ALPHA) image = 'docs' | ||||
| -       return base + image + '.jpg' | ||||
| -       return 'https://github.com/' + SOCIAL.github + '/' + (repo || '') + (filepath ? '/blob/' + (branch || 'master') + '/' + filepath : '' ); | ||||
| -   } | ||||
|  |  | |||
|  | @ -1,5 +1,13 @@ | |||
| //- 💫 MIXINS > BASE | ||||
| 
 | ||||
| //- Section | ||||
|     id - [string] anchor assigned to section (used for breadcrumb navigation) | ||||
| 
 | ||||
| mixin section(id) | ||||
|     section.o-section(id="section-" + id data-section=id) | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
| //- Aside wrapper | ||||
|     label - [string] aside label | ||||
| 
 | ||||
|  | @ -11,34 +19,26 @@ mixin aside-wrapper(label) | |||
| 
 | ||||
|             block | ||||
| 
 | ||||
| //- Date | ||||
|     input - [string] date in the format YYYY-MM-DD | ||||
| 
 | ||||
| mixin date(input) | ||||
|     - var date = new Date(input) | ||||
|     - var months = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December' ] | ||||
| 
 | ||||
|     time(datetime=JSON.parse(JSON.stringify(date)))&attributes(attributes)=months[date.getMonth()] + ' ' + date.getDate() + ', ' + date.getFullYear() | ||||
| 
 | ||||
| 
 | ||||
| //- SVG from map | ||||
|     file   - [string] SVG file name in /assets/img/ | ||||
| //- SVG from map (uses embedded SVG sprite) | ||||
|     name   - [string] SVG symbol id | ||||
|     width  - [integer] width in px | ||||
|     height - [integer] height in px (default: same as width) | ||||
| 
 | ||||
| mixin svg(file, name, width, height) | ||||
| mixin svg(name, width, height) | ||||
|     svg(aria-hidden="true" viewBox="0 0 #{width} #{height || width}" width=width height=(height || width))&attributes(attributes) | ||||
|         use(xlink:href="/assets/img/#{file}.svg##{name}") | ||||
|         use(xlink:href="#svg_#{name}") | ||||
| 
 | ||||
| 
 | ||||
| //- Icon | ||||
|     name - [string] icon name, should be SVG symbol ID | ||||
|     size - [integer] icon width and height (default: 20) | ||||
|     name   - [string] icon name (will be used as symbol id: #svg_{name}) | ||||
|     width  - [integer] icon width (default: 20) | ||||
|     height - [integer] icon height (defaults to width) | ||||
| 
 | ||||
| mixin icon(name, size) | ||||
|     - var size = size || 20 | ||||
|     +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) | ||||
| mixin icon(name, width, height) | ||||
|     - var width = width || 20 | ||||
|     - var height = height || width | ||||
|     +svg(name, width, height).o-icon(style="min-width: #{width}px")&attributes(attributes) | ||||
| 
 | ||||
| 
 | ||||
| //- Pro/Con/Neutral icon | ||||
|  | @ -46,8 +46,8 @@ mixin icon(name, size) | |||
|     size - [integer] icon size (optional) | ||||
| 
 | ||||
| mixin procon(icon, size) | ||||
|     - colors = { pro: "green", con: "red", neutral: "yellow" } | ||||
|     +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) | ||||
|     - colors = { pro: "green", con: "red", neutral: "subtle" } | ||||
|     +icon("circle", size || 16)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) | ||||
| 
 | ||||
| 
 | ||||
| //- Headlines Helper Mixin | ||||
|  | @ -80,8 +80,7 @@ mixin headline(level) | |||
| 
 | ||||
| mixin permalink(id) | ||||
|     if id | ||||
|         a.u-permalink(id=id href="##{id}") | ||||
|             +icon("anchor").u-permalink__icon | ||||
|         a.u-permalink(href="##{id}") | ||||
|             block | ||||
| 
 | ||||
|     else | ||||
|  | @ -109,7 +108,7 @@ mixin quickstart(groups, headline, description, hide_results) | |||
|                     .c-quickstart__fields | ||||
|                         for option in group.options | ||||
|                             input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id="qs-#{option.id}" value=option.id checked=option.checked) | ||||
|                             label.c-quickstart__label(for="qs-#{option.id}")!=option.title | ||||
|                             label.c-quickstart__label.u-text-tiny(for="qs-#{option.id}")!=option.title | ||||
|                                 if option.meta | ||||
|                                     |  #[span.c-quickstart__label__meta (#{option.meta})] | ||||
|                                 if option.help | ||||
|  | @ -122,12 +121,10 @@ mixin quickstart(groups, headline, description, hide_results) | |||
|                 code.c-code-block__content.c-quickstart__code(data-qs-results="") | ||||
|                     block | ||||
| 
 | ||||
|     .c-quickstart__info.u-text-tiny.o-block.u-text-right | ||||
|         |  Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! | ||||
| 
 | ||||
| 
 | ||||
| //- Quickstart code item | ||||
|     data [object] - Rendering conditions (keyed by option group ID, value: option) | ||||
|     data  - [object] Rendering conditions (keyed by option group ID, value: option) | ||||
|     style - [string] modifier ID for line style | ||||
| 
 | ||||
| mixin qs(data, style) | ||||
|     - args = {} | ||||
|  | @ -148,6 +145,13 @@ mixin terminal(label) | |||
|         +code.x-terminal__code | ||||
|             block | ||||
| 
 | ||||
| //- Chart.js | ||||
|     id - [string] chart ID, will be assigned as #chart_{id} | ||||
| 
 | ||||
| mixin chart(id) | ||||
|     figure.o-block&attributes(attributes) | ||||
|         canvas(id="chart_#{id}" width="800" height="400" style="max-width: 100%") | ||||
| 
 | ||||
| 
 | ||||
| //- Gitter chat button and widget | ||||
|     button - [string] text shown on button | ||||
|  | @ -156,26 +160,24 @@ mixin terminal(label) | |||
| mixin gitter(button, label) | ||||
|     aside.js-gitter.c-chat.is-collapsed(data-title=(label || button)) | ||||
| 
 | ||||
|     button.js-gitter-button.c-chat__button.u-text-small | ||||
|         +icon("chat").o-icon--inline | ||||
|     button.js-gitter-button.c-chat__button.u-text-tag | ||||
|         +icon("chat", 16).o-icon--inline | ||||
|         !=button | ||||
| 
 | ||||
| 
 | ||||
| //- Badge | ||||
|     name - [string] "pipy" or "conda" | ||||
|     image - [string] path to badge image | ||||
|     url   - [string] badge link | ||||
| 
 | ||||
| mixin badge(name) | ||||
|     - site = BADGES[name] | ||||
| 
 | ||||
|     if site | ||||
|         +a(site.link).u-padding-small | ||||
|             img(src=site.badge alt="{name} version" height="20") | ||||
| mixin badge(image, url) | ||||
|     +a(url).u-padding-small.u-hide-link&attributes(attributes) | ||||
|         img.o-badge(src=image alt=url height="20") | ||||
| 
 | ||||
| 
 | ||||
| //- Logo | ||||
| //- spaCy logo | ||||
| 
 | ||||
| mixin logo() | ||||
|     +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) | ||||
|     +svg("spacy", 675, 215).o-logo&attributes(attributes) | ||||
| 
 | ||||
| 
 | ||||
| //- Landing | ||||
|  | @ -186,18 +188,56 @@ mixin landing-header() | |||
|             .c-landing__content | ||||
|                 block | ||||
| 
 | ||||
| mixin landing-banner(headline, label) | ||||
|     .c-landing__banner.u-padding.o-block.u-color-light | ||||
|         +grid.c-landing__banner__content.o-no-block | ||||
|             +grid-col("third") | ||||
|                 h3.u-heading.u-heading-1 | ||||
|                     if label | ||||
|                         div | ||||
|                             span.u-text-label.u-text-label--light=label | ||||
|                     !=headline | ||||
| 
 | ||||
| mixin landing-badge(url, graphic, alt, size) | ||||
|     +a(url)(aria-label=alt title=alt).c-landing__badge | ||||
|         +svg("graphics", graphic, size || 225) | ||||
|             +grid-col("two-thirds").c-landing__banner__text | ||||
|                 block | ||||
| 
 | ||||
| 
 | ||||
| mixin landing-logos(title, logos) | ||||
|     .o-content.u-text-center&attributes(attributes) | ||||
|         h3.u-heading.u-text-label.u-color-dark=title | ||||
| 
 | ||||
|         each row, i in logos | ||||
|             - var is_last = i == logos.length - 1 | ||||
|             +grid("center").o-inline-list.o-no-block(class=is_last ? "o-no-block" : null) | ||||
|                 each details, name in row | ||||
|                     +a(details[0]).u-padding-medium | ||||
|                         +icon(name, details[1], details[2]) | ||||
| 
 | ||||
|                 if is_last | ||||
|                     block | ||||
| 
 | ||||
| 
 | ||||
| //- Under construction (temporary) | ||||
|     Marks sections that still need to be completed for the v2.0 release. | ||||
| 
 | ||||
| mixin under-construction() | ||||
|     +infobox("🚧 Under construction") | ||||
|     +infobox("Under construction", "🚧") | ||||
|         |  This section is still being written and will be updated for the v2.0 | ||||
|         |  release. Is there anything that you think should definitely be mentioned or | ||||
|         |  explained here? Any examples you'd like to see? #[strong Let us know] | ||||
|         |  on the #[+a(gh("spacy") + "/issues/1105") v2.0 alpha thread] on GitHub! | ||||
| 
 | ||||
| 
 | ||||
| //- Alpha infobox (temporary) | ||||
|     Added in the templates to notify users that they're visiting the alpha site. | ||||
| 
 | ||||
| mixin alpha-info() | ||||
|     +infobox("You are viewing the spaCy v2.0.0 alpha docs", "⚠️") | ||||
|         strong This page is part of the alpha documentation for spaCy v2.0. | ||||
|         |  It does not reflect the state of the latest stable release. | ||||
|         |  Because v2.0 is still under development, the implementation | ||||
|         |  may differ from the intended state described here. See the | ||||
|         |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] | ||||
|         |  for details on how to install and test the new version. To | ||||
|         |  read the official docs for spaCy v1.x, | ||||
|         |  #[+a("https://spacy.io/docs") go here]. | ||||
|  |  | |||
|  | @ -8,11 +8,15 @@ include _mixins-base | |||
|     level - [integer] headline level, corresponds to h1, h2, h3 etc. | ||||
|     id    - [string] unique identifier, creates permalink (optional) | ||||
| 
 | ||||
| mixin h(level, id) | ||||
|     +headline(level).u-heading&attributes(attributes) | ||||
| mixin h(level, id, source) | ||||
|     +headline(level).u-heading(id=id)&attributes(attributes) | ||||
|         +permalink(id) | ||||
|             block | ||||
| 
 | ||||
|         if source | ||||
|             +button(gh("spacy", source), false, "secondary", "small").u-nowrap.u-float-right | ||||
|                 span Source #[+icon("code", 14).o-icon--inline] | ||||
| 
 | ||||
| 
 | ||||
| //- External links | ||||
|     url     - [string] link href | ||||
|  | @ -38,21 +42,23 @@ mixin src(url) | |||
| 
 | ||||
| 
 | ||||
| //- API link (with added tag and automatically generated path) | ||||
|     path - [string] path to API docs page relative to /docs/api/ | ||||
|     path - [string] path to API docs page relative to /api/ | ||||
| 
 | ||||
| mixin api(path) | ||||
|     +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap | ||||
|     - path = convertAPIPath(path) | ||||
|     +a("/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap | ||||
|         block | ||||
| 
 | ||||
|         |  #[+icon("book", 18).o-icon--inline.u-color-theme] | ||||
|         |  #[+icon("book", 16).o-icon--inline.u-color-theme] | ||||
| 
 | ||||
| 
 | ||||
| //- Help icon with tooltip | ||||
|     tooltip   - [string] Tooltip text | ||||
|     icon_size - [integer] Optional size of help icon in px. | ||||
| 
 | ||||
| mixin help(tooltip) | ||||
| mixin help(tooltip, icon_size) | ||||
|     span(data-tooltip=tooltip)&attributes(attributes) | ||||
|         +icon("help", 16).i-icon--inline | ||||
|         +icon("help", icon_size || 16).o-icon--inline | ||||
| 
 | ||||
| 
 | ||||
| //- Aside for text | ||||
|  | @ -68,24 +74,43 @@ mixin aside(label) | |||
|     label    - [string] aside title (optional or false for no label) | ||||
|     language - [string] language for syntax highlighting (default: "python") | ||||
|                supports basic relevant languages available for PrismJS | ||||
|     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||
| 
 | ||||
| mixin aside-code(label, language) | ||||
| mixin aside-code(label, language, prompt) | ||||
|     +aside-wrapper(label) | ||||
|         +code(false, language).o-no-block | ||||
|         +code(false, language, prompt).o-no-block | ||||
|             block | ||||
| 
 | ||||
| 
 | ||||
| //- Infobox | ||||
|     label - [string] infobox title (optional or false for no title) | ||||
|     emoji - [string] optional emoji displayed before the title, passed as an | ||||
|             argument so it can be wrapped for spacing | ||||
| 
 | ||||
| mixin infobox(label) | ||||
| mixin infobox(label, emoji) | ||||
|     aside.o-box.o-block.u-text-small | ||||
|         if label | ||||
|             h3.u-text-label.u-color-theme=label | ||||
|             h3.u-heading.u-text-label.u-color-theme | ||||
|                 if emoji | ||||
|                     span.o-emoji=emoji | ||||
|                 |  #{label} | ||||
| 
 | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
| //- Logos displayed in the top corner of some infoboxes | ||||
|     logos - [array] List of icon ID, width, height and link. | ||||
| 
 | ||||
| mixin infobox-logos(...logos) | ||||
|     .o-box__logos.u-text-right.u-float-right | ||||
|         for logo in logos | ||||
|             if logo[3] | ||||
|                 |  #[+a(logo[3]).u-inline-block.u-hide-link.u-padding-small #[+icon(logo[0], logo[1], logo[2]).u-color-dark]] | ||||
|             else | ||||
|                 |  #[+icon(logo[0], logo[1], logo[2]).u-color-dark] | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| //- Link button | ||||
|     url      - [string] link href | ||||
|     trusted  - [boolean] if not set / false, rel="noopener nofollow" is added | ||||
|  | @ -94,7 +119,7 @@ mixin infobox(label) | |||
|                see assets/css/_components/_buttons.sass | ||||
| 
 | ||||
| mixin button(url, trusted, ...style) | ||||
|     - external = url.includes("http") | ||||
|     - external = url && url.includes("http") | ||||
|     a.c-button.u-text-label(href=url class=prefixArgs(style, "c-button") role="button" target=external ? "_blank" : null rel=external && !trusted ? "noopener nofollow" : null)&attributes(attributes) | ||||
|         block | ||||
| 
 | ||||
|  | @ -103,31 +128,33 @@ mixin button(url, trusted, ...style) | |||
|     label    - [string] aside title (optional or false for no label) | ||||
|     language - [string] language for syntax highlighting (default: "python") | ||||
|                supports basic relevant languages available for PrismJS | ||||
|     prompt    - [string] prompt or icon to display next to code block, (mostly used for old/new) | ||||
|     prompt   - [string] prompt displayed before first line, e.g. "$" | ||||
|     height   - [integer] optional height to clip code block to | ||||
|     icon     - [string] icon displayed next to code block (e.g. "accept" for new code) | ||||
|     wrap     - [boolean] wrap text and disable horizontal scrolling | ||||
| 
 | ||||
| mixin code(label, language, prompt, height) | ||||
| mixin code(label, language, prompt, height, icon, wrap) | ||||
|     pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) | ||||
|         if label | ||||
|             h4.u-text-label.u-text-label--dark=label | ||||
|         - var icon = (prompt == 'accept' || prompt == 'reject') | ||||
|         - var icon = icon || (prompt == 'accept' || prompt == 'reject') | ||||
|         if icon | ||||
|             - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} | ||||
|             .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) | ||||
|                 +icon(icon, 18) | ||||
| 
 | ||||
|         code.c-code-block__content(data-prompt=icon ? null : prompt) | ||||
|         code.c-code-block__content(class=wrap ? "u-wrap" : null data-prompt=icon ? null : prompt) | ||||
|             block | ||||
| 
 | ||||
| 
 | ||||
| //- Code blocks to display old/new versions | ||||
| 
 | ||||
| mixin code-old() | ||||
|     +code(false, false, "reject").o-block-small | ||||
|     +code(false, false, false, false, "reject").o-block-small | ||||
|         block | ||||
| 
 | ||||
| mixin code-new() | ||||
|     +code(false, false, "accept").o-block-small | ||||
|     +code(false, false, false, false, "accept").o-block-small | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
|  | @ -138,12 +165,33 @@ mixin code-new() | |||
| 
 | ||||
| mixin codepen(slug, height, default_tab) | ||||
|     figure.o-block(style="min-height: #{height}px")&attributes(attributes) | ||||
|         .codepen(data-height=height data-theme-id="26467" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) | ||||
|         .codepen(data-height=height data-theme-id="31335" data-slug-hash=slug data-default-tab=(default_tab || "result") data-embed-version="2" data-user=SOCIAL.codepen) | ||||
|             +a("https://codepen.io/" + SOCIAL.codepen + "/" + slug) View on CodePen | ||||
| 
 | ||||
|         script(async src="https://assets.codepen.io/assets/embed/ei.js") | ||||
| 
 | ||||
| 
 | ||||
| //- GitHub embed | ||||
|     repo     - [string] repository owned by explosion organization | ||||
|     file     - [string] logical path to file, relative to repository root | ||||
|     alt_file - [string] alternative file path used in footer and link button | ||||
|     height   - [integer] height of code preview in px | ||||
| 
 | ||||
| mixin github(repo, file, alt_file, height) | ||||
|     - var branch = ALPHA ? "develop" : "master" | ||||
|     - var height = height || 250 | ||||
| 
 | ||||
|     figure.o-block | ||||
|         pre.c-code-block.o-block-small(class="lang-#{(language || DEFAULT_SYNTAX)}" style="height: #{height}px; min-height: #{height}px") | ||||
|             code.c-code-block__content(data-gh-embed="#{repo}/#{branch}/#{file}") | ||||
| 
 | ||||
|         footer.o-grid.u-text | ||||
|             .o-block-small.u-flex-full #[+icon("github")] #[code=repo + '/' + (alt_file || file)] | ||||
|             div | ||||
|                 +button(gh(repo, alt_file || file), false, "primary", "small") View on GitHub | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| //- Images / figures | ||||
|     url     - [string] url or path to image | ||||
|     width   - [integer] image width in px, for better rendering (default: 500) | ||||
|  | @ -168,10 +216,26 @@ mixin image-caption() | |||
|         block | ||||
| 
 | ||||
| 
 | ||||
| //- Label | ||||
| //- Graphic or illustration with button | ||||
|     original - [string] Path to original image | ||||
| 
 | ||||
| mixin graphic(original) | ||||
|     +image | ||||
|         block | ||||
|         if original | ||||
|             .u-text-right | ||||
|                 +button(original, false, "secondary", "small") View large graphic | ||||
| 
 | ||||
| 
 | ||||
| //- Labels | ||||
| 
 | ||||
| mixin label() | ||||
|     .u-text-label.u-color-subtle&attributes(attributes) | ||||
|     .u-text-label.u-color-dark&attributes(attributes) | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
| mixin label-inline() | ||||
|     strong.u-text-label.u-color-dark&attributes(attributes) | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
|  | @ -188,7 +252,9 @@ mixin tag() | |||
| mixin tag-model(...capabs) | ||||
|     - var intro = "To use this functionality, spaCy needs a model to be installed" | ||||
|     - var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : "" | ||||
|     +tag Requires model | ||||
| 
 | ||||
|     span.u-nowrap | ||||
|         +tag Needs model | ||||
|         +help(intro + ext + ".").u-color-theme | ||||
| 
 | ||||
| 
 | ||||
|  | @ -219,13 +285,7 @@ mixin list(type, start) | |||
| 
 | ||||
| //- List item (only used within +list) | ||||
| 
 | ||||
| mixin item(procon) | ||||
|     if procon | ||||
|         li&attributes(attributes) | ||||
|             +procon(procon).c-list__icon | ||||
|             block | ||||
| 
 | ||||
|     else | ||||
| mixin item() | ||||
|     li.c-list__item&attributes(attributes) | ||||
|         block | ||||
| 
 | ||||
|  | @ -237,9 +297,9 @@ mixin table(head) | |||
|     table.c-table.o-block&attributes(attributes) | ||||
| 
 | ||||
|         if head | ||||
|             +row | ||||
|             +row("head") | ||||
|                 each column in head | ||||
|                     th.c-table__head-cell.u-text-label=column | ||||
|                     +head-cell=column | ||||
| 
 | ||||
|         block | ||||
| 
 | ||||
|  | @ -251,10 +311,11 @@ mixin row(...style) | |||
|         block | ||||
| 
 | ||||
| 
 | ||||
| //- Footer table row (only used within +table) | ||||
| 
 | ||||
| mixin footrow() | ||||
|     tr.c-table__row.c-table__row--foot&attributes(attributes) | ||||
| //- Header table cell (only used within +row) | ||||
| 
 | ||||
| mixin head-cell() | ||||
|     th.c-table__head-cell.u-text-label&attributes(attributes) | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
|  | @ -285,70 +346,57 @@ mixin grid-col(width) | |||
| 
 | ||||
| //- Card (only used within +grid) | ||||
|     title  - [string] card title | ||||
|     details   - [object] url, image, author, description, tags etc. | ||||
|                 (see /docs/usage/_data.json) | ||||
| 
 | ||||
| mixin card(title, details) | ||||
|     +grid-col("half").o-card.u-text&attributes(attributes) | ||||
|         if details.image | ||||
|             +a(details.url).o-block-small | ||||
|                 img(src=details.image alt=title width="300" role="presentation") | ||||
|     url    - [string] link for card | ||||
|     author - [string] optional author, displayed as byline at the bottom | ||||
|     icon   - [string] optional ID of icon displayed with card | ||||
|     width  - [string] optional width of grid column, defaults to "half" | ||||
| 
 | ||||
| mixin card(title, url, author, icon, width) | ||||
|     +grid-col(width || "half").o-box.o-grid.o-grid--space.u-text&attributes(attributes) | ||||
|         +a(url) | ||||
|             h4.u-heading.u-text-label | ||||
|                 if icon | ||||
|                     +icon(icon, 25).u-float-right | ||||
|                 if title | ||||
|             +a(details.url) | ||||
|                 +h(3)=title | ||||
|                     span.u-color-dark=title | ||||
|             .o-block-small.u-text-small | ||||
|                 block | ||||
|         if author | ||||
|             .u-color-subtle.u-text-tiny by #{author} | ||||
| 
 | ||||
|                     if details.author | ||||
|                         .u-text-small.u-color-subtle by #{details.author} | ||||
| 
 | ||||
|         if details.description || details.tags | ||||
|             ul | ||||
|                 if details.description | ||||
|                     li=details.description | ||||
| 
 | ||||
|                 if details.tags | ||||
|                     li | ||||
|                         each tag in details.tags | ||||
|                             span.u-text-tag #{tag} | ||||
|                             |   | ||||
| //- Table of contents, to be used with +item mixins for links | ||||
|     col - [string] width of column (see +grid-col) | ||||
| 
 | ||||
| mixin table-of-contents(col) | ||||
|     +grid-col(col || "half") | ||||
|         +infobox | ||||
|             +label.o-block-small Table of contents | ||||
|             +list("numbers").u-text-small.o-no-block | ||||
|                 block | ||||
| 
 | ||||
| 
 | ||||
| //- Simpler card list item (only used within +list) | ||||
|     title     - [string] card title | ||||
|     details   - [object] url, image, author, description, tags etc. | ||||
|                 (see /docs/usage/_data.json) | ||||
| //- Bibliography | ||||
|     id - [string] ID of bibliography component, for anchor links. Can be used if | ||||
|          there's more than one bibliography on one page. | ||||
| 
 | ||||
| mixin card-item(title, details) | ||||
|     +item&attributes(attributes) | ||||
|         +a(details.url)=title | ||||
| 
 | ||||
|         if details.description | ||||
|             br | ||||
|             span=details.description | ||||
| 
 | ||||
|         if details.author | ||||
|             br | ||||
|             span.u-text-small.u-color-subtle by #{details.author} | ||||
| mixin bibliography(id) | ||||
|     section(id=id || "bibliography") | ||||
|         +infobox | ||||
|             +label.o-block-small Bibliography | ||||
|             +list("numbers").u-text-small.o-no-block | ||||
|                 block | ||||
| 
 | ||||
| 
 | ||||
| //- Table row for models table | ||||
| //- Footnote | ||||
|     id      - [string / integer] ID of footnote. | ||||
|     bib_id  - [string] ID of bibliography component, defaults to "bibliography". | ||||
|     tooltip - [string] optional text displayed as tooltip | ||||
| 
 | ||||
| mixin model-row(name, lang, procon, size, license, default_model, divider) | ||||
|     - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } | ||||
| 
 | ||||
|     +row(divider ? "divider": null) | ||||
|         +cell #[code=name] | ||||
|             if default_model | ||||
|                 |  #[span.u-color-theme(title="default model") #[+icon("star", 16)]] | ||||
|         +cell=lang | ||||
|         each icon in procon | ||||
|             +cell.u-text-center #[+procon(icon ? "pro" : "con")] | ||||
|         +cell.u-text-right=size | ||||
|         +cell | ||||
|             if license in licenses | ||||
|                 +a(licenses[license])=license | ||||
| mixin fn(id, bib_id, tooltip) | ||||
|     sup.u-padding-small(id="bib" + id data-tooltip=tooltip) | ||||
|         span.u-text-tag | ||||
|             +a("#" + (bib_id || "bibliography")).u-hide-link #{id} | ||||
| 
 | ||||
| 
 | ||||
| //- Table rows for annotation specs | ||||
|  | @ -383,14 +431,3 @@ mixin annotation-row(annots, style) | |||
|             else | ||||
|                 +cell=cell | ||||
|         block | ||||
| 
 | ||||
| 
 | ||||
| //- Table of contents, to be used with +item mixins for links | ||||
|     col - [string] width of column (see +grid-col) | ||||
| 
 | ||||
| mixin table-of-contents(col) | ||||
|     +grid-col(col || "half") | ||||
|         +infobox | ||||
|             +label.o-block-small Table of contents | ||||
|             +list("numbers").u-text-small.o-no-block | ||||
|                 block | ||||
|  |  | |||
|  | @ -1,19 +1,15 @@ | |||
| //- 💫 INCLUDES > TOP NAVIGATION | ||||
| 
 | ||||
| include _mixins | ||||
| 
 | ||||
| nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null) | ||||
|     a(href='/') #[+logo] | ||||
| 
 | ||||
|     if SUBSECTION != "index" | ||||
|         .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION | ||||
|     a(href="/" aria-label=SITENAME) #[+logo] | ||||
| 
 | ||||
|     ul.c-nav__menu | ||||
|         - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION | ||||
| 
 | ||||
|         each url, item in NAV | ||||
|             li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) | ||||
|         - var current_url = '/' + current.path[0] | ||||
|         each url, item in NAVIGATION | ||||
|             li.c-nav__menu__item(class=(current_url == url) ? "is-active" : null) | ||||
|                 +a(url)=item | ||||
| 
 | ||||
|         li.c-nav__menu__item | ||||
|             +a(gh("spaCy"))(aria-label="GitHub").u-hidden-xs #[+icon("github", 20)] | ||||
|         li.c-nav__menu__item.u-hidden-xs | ||||
|             +a(gh("spaCy"))(aria-label="GitHub") #[+icon("github", 20)] | ||||
| 
 | ||||
|     progress.c-progress.js-progress(value="0" max="1") | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| //- 💫 INCLUDES > NEWSLETTER | ||||
| 
 | ||||
| ul.o-block | ||||
| ul.o-block-small | ||||
|     li.u-text-label.u-color-subtle Stay in the loop! | ||||
|     li Receive updates about new releases, tutorials and more. | ||||
| 
 | ||||
|  | @ -10,7 +10,6 @@ form.o-grid#mc-embedded-subscribe-form(action="//#{MAILCHIMP.user}.list-manage.c | |||
|     div(style="position: absolute; left: -5000px;" aria-hidden="true") | ||||
|         input(type="text" name="b_#{MAILCHIMP.id}_#{MAILCHIMP.list}" tabindex="-1" value="") | ||||
| 
 | ||||
|     .o-grid-col.u-border.u-padding-small | ||||
|         input#mce-EMAIL.u-text(type="email" name="EMAIL" placeholder="Your email") | ||||
| 
 | ||||
|         button#mc-embedded-subscribe.u-text-label.u-color-theme(type="submit" name="subscribe") Sign up | ||||
|     .o-grid-col.o-grid.o-grid--nowrap.o-field.u-padding-small | ||||
|         input#mce-EMAIL.o-field__input.u-text(type="email" name="EMAIL" placeholder="Your email" aria-label="Your email") | ||||
|         button#mc-embedded-subscribe.o-field__button.u-text-label.u-color-theme.u-nowrap(type="submit" name="subscribe") Sign up | ||||
|  |  | |||
|  | @ -1,47 +1,56 @@ | |||
| //- 💫 INCLUDES > DOCS PAGE TEMPLATE | ||||
| 
 | ||||
| - sidebar_content = (SUBSECTION != "index") ? public.docs[SUBSECTION]._data.sidebar : public.docs._data.sidebar || FOOTER | ||||
| - sidebar_content = (public[SECTION] ? public[SECTION]._data.sidebar : public._data[SECTION] ? public._data[SECTION].sidebar : false) || FOOTER | ||||
| 
 | ||||
| include _sidebar | ||||
| 
 | ||||
| main.o-main.o-main--sidebar.o-main--aside | ||||
|     article.o-content | ||||
|         +grid.o-no-block | ||||
|             +grid-col(source ? "two-thirds" : "full") | ||||
|                 +h(1)=title | ||||
|             +h(1).u-heading--title=title.replace("'", "’") | ||||
|                 if tag | ||||
|                     +tag=tag | ||||
|                 if tag_new | ||||
|                     +tag-new(tag_new) | ||||
| 
 | ||||
|                 if teaser | ||||
|                     .u-heading__teaser.u-text-small.u-color-dark=teaser | ||||
|                 else if IS_MODELS | ||||
|                     .u-heading__teaser.u-text-small.u-color-dark | ||||
|                         |  Available statistical models for | ||||
|                         |  #[code=current.source] (#{LANGUAGES[current.source]}). | ||||
| 
 | ||||
|             if source | ||||
|                 +grid-col("third").u-text-right | ||||
|                     .o-inline-list | ||||
|                         +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] | ||||
|                 .o-block.u-text-right | ||||
|                     +button(gh("spacy", source), false, "secondary", "small").u-nowrap | ||||
|                         |  Source #[+icon("code", 14)] | ||||
| 
 | ||||
|         //-if ALPHA | ||||
|         //-    +alpha-info | ||||
| 
 | ||||
|         if ALPHA | ||||
|             +infobox("⚠️ You are viewing the spaCy v2.0.0 alpha docs") | ||||
|                 strong This page is part of the alpha documentation for spaCy v2.0. | ||||
|                 |  It does not reflect the state of the latest stable release. | ||||
|                 |  Because v2.0 is still under development, the implementation | ||||
|                 |  may differ from the intended state described here. See the | ||||
|                 |  #[+a(gh("spaCy") + "/releases/tag/v2.0.0-alpha") release notes] | ||||
|                 |  for details on how to install and test the new version. To | ||||
|                 |  read the official docs for spaCy v1.x, | ||||
|                 |  #[+a("https://spacy.io/docs") go here]. | ||||
| 
 | ||||
|         if IS_MODELS | ||||
|             include _page_models | ||||
|         else | ||||
|             !=yield | ||||
| 
 | ||||
|     +grid.o-content.u-text | ||||
|         +grid-col("half") | ||||
|             if next && public.docs[SUBSECTION]._data[next] | ||||
|                 - data = public.docs[SUBSECTION]._data[next] | ||||
| 
 | ||||
|             if !IS_MODELS | ||||
|                 .o-inline-list | ||||
|                     span #[strong.u-text-label Read next:] #[+a(next).u-link=data.title] | ||||
|                     +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary", "small") | ||||
|                         |  #[span.o-icon Suggest edits] #[+icon("code", 14)] | ||||
| 
 | ||||
|         +grid-col("half").u-text-right | ||||
|             .o-inline-list | ||||
|                 +button(gh("spacy", "website/" + current.path.join('/') + ".jade"), false, "secondary").u-text-tag Suggest edits #[+icon("code", 14)] | ||||
|             if next && public[SECTION]._data[next] | ||||
|                 - data = public[SECTION]._data[next] | ||||
| 
 | ||||
|                 +grid("vcenter") | ||||
|                     +a(next).u-text-small.u-flex-full | ||||
|                         h4.u-text-label.u-color-dark Read next | ||||
|                         |  #{data.title} | ||||
| 
 | ||||
|                     +a(next).c-icon-button.c-icon-button--right(aria-hidden="true") | ||||
|                         +icon("arrow-right", 24) | ||||
| 
 | ||||
|     +gitter("spaCy chat") | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
77  website/_includes/_page_models.jade  Normal file
							|  | @ -0,0 +1,77 @@ | |||
| //- 💫 INCLUDES > MODELS PAGE TEMPLATE | ||||
| 
 | ||||
| for id in CURRENT_MODELS | ||||
|     +section(id) | ||||
|         +grid("vcenter").o-no-block(id=id) | ||||
|             +grid-col("two-thirds") | ||||
|                 +h(2) | ||||
|                     +a("#" + id).u-permalink=id | ||||
| 
 | ||||
|             +grid-col("third").u-text-right | ||||
|                 .u-color-subtle.u-text-tiny | ||||
|                     +button(gh("spacy-models") + "/releases", true, "secondary", "small")(data-tpl=id data-tpl-key="download") | ||||
|                         |  Release details | ||||
|                     .u-padding-small Latest: #[code(data-tpl=id data-tpl-key="version") n/a] | ||||
| 
 | ||||
|         +aside-code("Installation", "bash", "$"). | ||||
|             spacy download #{id} | ||||
| 
 | ||||
|         - var comps = getModelComponents(id) | ||||
| 
 | ||||
|         p(data-tpl=id data-tpl-key="description") | ||||
| 
 | ||||
|         div(data-tpl=id data-tpl-key="error" style="display: none") | ||||
|             +infobox | ||||
|                 |  Unable to load model details from GitHub. To find out more | ||||
|                 |  about this model, see the overview of the | ||||
|                 |  #[+a(gh("spacy-models") + "/releases") latest model releases]. | ||||
| 
 | ||||
|         +table(data-tpl=id data-tpl-key="table") | ||||
|             +row | ||||
|                 +cell #[+label Language] | ||||
|                 +cell #[+tag=comps.lang] #{LANGUAGES[comps.lang]} | ||||
|             for comp, label in {"Type": comps.type, "Genre": comps.genre} | ||||
|                 +row | ||||
|                     +cell #[+label=label] | ||||
|                     +cell #[+tag=comp] #{MODEL_META[comp]} | ||||
|             +row | ||||
|                 +cell #[+label Size] | ||||
|                 +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]] | ||||
| 
 | ||||
|             each label in ["Pipeline", "Sources", "Author", "License"] | ||||
|                 - var field = label.toLowerCase() | ||||
|                 +row | ||||
|                     +cell.u-nowrap | ||||
|                         +label=label | ||||
|                             if MODEL_META[field] | ||||
|                                 |  #[+help(MODEL_META[field]).u-color-subtle] | ||||
|                     +cell | ||||
|                         span(data-tpl=id data-tpl-key=field) #[em n/a] | ||||
| 
 | ||||
|             +row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none") | ||||
|                 +cell | ||||
|                     +label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle] | ||||
|                 +cell | ||||
|                     .o-field.u-float-left | ||||
|                         select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat") | ||||
|                     .o-empty(data-tpl=id data-tpl-key="compat-versions")   | ||||
| 
 | ||||
|         section(data-tpl=id data-tpl-key="accuracy-wrapper" style="display: none") | ||||
|             +grid.o-no-block | ||||
|                 +grid-col("third") | ||||
|                     +h(4) Accuracy | ||||
|                     +table.o-block-small | ||||
|                         for label, field in MODEL_ACCURACY | ||||
|                             +row(style="display: none") | ||||
|                                 +cell.u-nowrap | ||||
|                                     +label=label | ||||
|                                         if MODEL_META[field] | ||||
|                                             |  #[+help(MODEL_META[field]).u-color-subtle] | ||||
|                                 +cell.u-text-right(data-tpl=id data-tpl-key=field) | ||||
|                                     |  n/a | ||||
| 
 | ||||
|                 +grid-col("two-thirds") | ||||
|                     +h(4) Comparison | ||||
|                     +chart(id).u-padding-small | ||||
| 
 | ||||
|         p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes") | ||||
|  | @ -1,27 +1,46 @@ | |||
| //- 💫 INCLUDES > SCRIPTS | ||||
| 
 | ||||
| script(src="/assets/js/main.js?v#{V_JS}") | ||||
| script(src="/assets/js/prism.js") | ||||
| if quickstart | ||||
|         script(src="/assets/js/quickstart.min.js") | ||||
| 
 | ||||
| if SECTION == "docs" | ||||
|     if quickstart | ||||
|         script(src="/assets/js/quickstart.js") | ||||
|         script var qs = new Quickstart("#qs") | ||||
| if IS_PAGE | ||||
|     script(src="/assets/js/in-view.min.js") | ||||
| 
 | ||||
|     script. | ||||
|         ((window.gitter = {}).chat = {}).options = { | ||||
|             useStyles: false, | ||||
|             activationElement: '.js-gitter-button', | ||||
|             targetElement: '.js-gitter', | ||||
|             room: '!{SOCIAL.gitter}' | ||||
|         }; | ||||
| 
 | ||||
|     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) | ||||
| if HAS_MODELS | ||||
|     script(src="/assets/js/chart.min.js") | ||||
| 
 | ||||
| if environment == "deploy" | ||||
|     script | ||||
|     script(async src="https://www.google-analytics.com/analytics.js") | ||||
| 
 | ||||
| script(src="/assets/js/prism.min.js") | ||||
| script(src="/assets/js/main.js?v#{V_JS}") | ||||
| 
 | ||||
| script | ||||
|     | new ProgressBar('.js-progress'); | ||||
| 
 | ||||
|     if changelog | ||||
|         | new Changelog('!{SOCIAL.github}', 'spacy'); | ||||
| 
 | ||||
|     if quickstart | ||||
|         | new Quickstart("#qs"); | ||||
| 
 | ||||
|     if IS_PAGE | ||||
|         | new SectionHighlighter('data-section', 'data-nav'); | ||||
|         | new GitHubEmbed('!{SOCIAL.github}', 'data-gh-embed'); | ||||
|         | ((window.gitter = {}).chat = {}).options = { | ||||
|         |     useStyles: false, | ||||
|         |     activationElement: '.js-gitter-button', | ||||
|         |     targetElement: '.js-gitter', | ||||
|         |     room: '!{SOCIAL.gitter}' | ||||
|         | }; | ||||
| 
 | ||||
|     if HAS_MODELS | ||||
|         | new ModelLoader('!{MODELS_REPO}', !{JSON.stringify(CURRENT_MODELS)}, !{JSON.stringify(MODEL_LICENSES)}, !{JSON.stringify(MODEL_ACCURACY)}); | ||||
| 
 | ||||
|     if environment == "deploy" | ||||
|         | window.ga=window.ga||function(){ | ||||
|         | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ||||
|         | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); | ||||
| 
 | ||||
|     script(async src="https://www.google-analytics.com/analytics.js") | ||||
| if IS_PAGE | ||||
|     script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) | ||||
|  |  | |||
|  | @ -1,13 +1,23 @@ | |||
| //- 💫 INCLUDES > SIDEBAR | ||||
| 
 | ||||
| include _mixins | ||||
| 
 | ||||
| menu.c-sidebar.js-sidebar.u-text | ||||
|     if sidebar_content | ||||
|         each items, menu in sidebar_content | ||||
|             ul.c-sidebar__section.o-block | ||||
|                 li.u-text-label.u-color-subtle=menu | ||||
|         each items, sectiontitle in sidebar_content | ||||
|             ul.c-sidebar__section.o-block-small | ||||
|                 li.u-text-label.u-color-dark=sectiontitle | ||||
| 
 | ||||
|                 each url, item in items | ||||
|                     li(class=(CURRENT == url || (CURRENT == "index" && url == "./")) ? "is-active" : null) | ||||
|                         +a(url)=item | ||||
|                     - var is_current = CURRENT == url || (CURRENT == "index" && url == "./") | ||||
|                     li.c-sidebar__item | ||||
|                         +a(url)(class=is_current ? "is-active" : null)=item | ||||
| 
 | ||||
|                         if is_current | ||||
|                             if IS_MODELS && CURRENT_MODELS.length | ||||
|                                 - menu = Object.assign({}, ...CURRENT_MODELS.map(id => ({ [id]: id }))) | ||||
|                             if menu | ||||
|                                 ul.c-sidebar__crumb.u-hidden-sm | ||||
|                                     - var counter = 0 | ||||
|                                     for id, title in menu | ||||
|                                         - counter++ | ||||
|                                         li.c-sidebar__crumb__item(data-nav=id class=(counter == 1) ? "is-active" : null) | ||||
|                                             +a("#section-" + id)=title | ||||
|  |  | |||
							
								
								
									
157  website/_includes/_svg.jade  Normal file
File diff suppressed because one or more lines are too long
							|  | @ -2,11 +2,16 @@ | |||
| 
 | ||||
| include _includes/_mixins | ||||
| 
 | ||||
| - title = IS_MODELS ? LANGUAGES[current.source] || title : title | ||||
| - social_title = (SECTION == "index") ? SITENAME + " - " + SLOGAN : title + " - " + SITENAME | ||||
| - social_img = SITE_URL + "/assets/img/social/preview_" + (preview || ALPHA ? "alpha" : "default") + ".jpg" | ||||
| 
 | ||||
| doctype html | ||||
| html(lang="en") | ||||
|     title | ||||
|         if SECTION == "docs" && SUBSECTION && SUBSECTION != "index" | ||||
|             | #{title} | #{SITENAME} #{SUBSECTION == "api" ? "API" : "Usage"} Documentation | ||||
|         if SECTION == "api" || SECTION == "usage" || SECTION == "models" | ||||
|             - var title_section = (SECTION == "api") ? "API" : SECTION.charAt(0).toUpperCase() + SECTION.slice(1) | ||||
|             | #{title} | #{SITENAME} #{title_section} Documentation | ||||
| 
 | ||||
|         else if SECTION != "index" | ||||
|             | #{title} | #{SITENAME} | ||||
|  | @ -22,32 +27,30 @@ html(lang="en") | |||
|     meta(property="og:type" content="website") | ||||
|     meta(property="og:site_name" content=sitename) | ||||
|     meta(property="og:url" content="#{SITE_URL}/#{current.path.join('/')}") | ||||
|     meta(property="og:title" content="#{title} - spaCy") | ||||
|     meta(property="og:title" content=social_title) | ||||
|     meta(property="og:description" content=description) | ||||
|     meta(property="og:image" content=getSocialImg()) | ||||
|     meta(property="og:image" content=social_img) | ||||
| 
 | ||||
|     meta(name="twitter:card" content="summary_large_image") | ||||
|     meta(name="twitter:site" content="@" + SOCIAL.twitter) | ||||
|     meta(name="twitter:title" content="#{title} - spaCy") | ||||
|     meta(name="twitter:title" content=social_title) | ||||
|     meta(name="twitter:description" content=description) | ||||
|     meta(name="twitter:image" content=getSocialImg()) | ||||
|     meta(name="twitter:image" content=social_img) | ||||
| 
 | ||||
|     link(rel="shortcut icon" href="/assets/img/favicon.ico") | ||||
|     link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") | ||||
| 
 | ||||
|     if ALPHA && SECTION == "docs" | ||||
|     if SECTION == "api" | ||||
|         link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet") | ||||
| 
 | ||||
|     else if SUBSECTION == "usage" | ||||
|         link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") | ||||
| 
 | ||||
|     else | ||||
|         link(href="/assets/css/style.css?v#{V_CSS}" rel="stylesheet") | ||||
| 
 | ||||
|     body | ||||
|         include _includes/_svg | ||||
|         include _includes/_navigation | ||||
| 
 | ||||
|         if SECTION == "docs" | ||||
|         if !landing | ||||
|             include _includes/_page-docs | ||||
| 
 | ||||
|         else | ||||
|  |  | |||
							
								
								
									
43  website/api/_annotation/_biluo.jade  Normal file
							|  | @ -0,0 +1,43 @@ | |||
| //- 💫 DOCS > API > ANNOTATION > BILUO | ||||
| 
 | ||||
| +table([ "Tag", "Description" ]) | ||||
|     +row | ||||
|         +cell #[code #[span.u-color-theme B] EGIN] | ||||
|         +cell The first token of a multi-token entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code #[span.u-color-theme I] N] | ||||
|         +cell An inner token of a multi-token entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code #[span.u-color-theme L] AST] | ||||
|         +cell The final token of a multi-token entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code #[span.u-color-theme U] NIT] | ||||
|         +cell A single-token entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code #[span.u-color-theme O] UT] | ||||
|         +cell A non-entity token. | ||||
| 
 | ||||
| +aside("Why BILUO, not IOB?") | ||||
|     |  There are several coding schemes for encoding entity annotations as | ||||
|     |  token tags.  These coding schemes are equally expressive, but not | ||||
|     |  necessarily equally learnable. | ||||
|     |  #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth] | ||||
|     |  showed that the minimal #[strong Begin], #[strong In], #[strong Out] | ||||
|     |  scheme was more difficult to learn than the #[strong BILUO] scheme that | ||||
|     |  we use, which explicitly marks boundary tokens. | ||||
| 
 | ||||
| p | ||||
|     |  spaCy translates the character offsets into this scheme, in order to | ||||
|     |  decide the cost of each action given the current state of the entity | ||||
|     |  recogniser. The costs are then used to calculate the gradient of the | ||||
|     |  loss, to train the model. The exact algorithm is a pastiche of | ||||
|     |  well-known methods, and is not currently described in any single | ||||
|     |  publication. The model is a greedy transition-based parser guided by a | ||||
|     |  linear model whose weights are learned using the averaged perceptron | ||||
|     |  loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | ||||
|     |  imitation learning strategy. The transition system is equivalent to the | ||||
|     |  BILUO tagging scheme. | ||||
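| 
 | ||||
| p | ||||
|     |  For example, the sentence below contains one single-token entity and one | ||||
|     |  two-token entity. The snippet is a hand-written illustration of the | ||||
|     |  encoding only: the sentence, labels and tags are made up for this example | ||||
|     |  and no spaCy API is called. | ||||
| 
 | ||||
| +code. | ||||
|     # Toy illustration of BILUO-encoded entities (not spaCy code). | ||||
|     tokens = ["Apple", "is", "opening", "a", "store", "in", "San", "Francisco", "."] | ||||
|     # "Apple" is a single-token ORG entity (U-), "San Francisco" spans two | ||||
|     # tokens (B- then L-), and every other token is outside an entity (O). | ||||
|     biluo_tags = ["U-ORG", "O", "O", "O", "O", "O", "B-GPE", "L-GPE", "O"] | ||||
|     assert len(biluo_tags) == len(tokens) | ||||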
							
								
								
									
115  website/api/_architecture/_cython.jade  Normal file
							|  | @ -0,0 +1,115 @@ | |||
| //- 💫 DOCS > API > ARCHITECTURE > CYTHON | ||||
| 
 | ||||
| +aside("What's Cython?") | ||||
|     |  #[+a("http://cython.org/") Cython] is a language for writing | ||||
|     |  C extensions for Python. Most Python code is also valid Cython, but | ||||
|     |  you can add type declarations to get efficient memory-managed code | ||||
|     |  just like C or C++. | ||||
| 
 | ||||
| p | ||||
|     |  spaCy's core data structures are implemented as | ||||
|     |  #[+a("http://cython.org/") Cython] #[code cdef] classes. Memory is | ||||
|     |  managed through the #[+a(gh("cymem")) #[code cymem]] | ||||
|     |  #[code cymem.Pool] class, which allows you | ||||
|     |  to allocate memory which will be freed when the #[code Pool] object | ||||
|     |  is garbage collected. This means you usually don't have to worry | ||||
|     |  about freeing memory. You just have to decide which Python object | ||||
|     |  owns the memory, and make it own the #[code Pool]. When that object | ||||
|     |  goes out of scope, the memory will be freed. You do have to take | ||||
|     |  care that no pointers outlive the object that owns them — but this | ||||
|     |  is generally quite easy. | ||||
| 
 | ||||
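| p | ||||
|     |  A minimal sketch of this ownership pattern, as an illustrative class | ||||
|     |  rather than code taken from spaCy itself: | ||||
| 
 | ||||
| +code. | ||||
|     from cymem.cymem cimport Pool | ||||
|     cdef class Counts: | ||||
|         # Toy example: the Python object owns the Pool, so everything | ||||
|         # allocated from the Pool (including self.counts) is freed when | ||||
|         # the object is garbage collected. | ||||
|         cdef Pool mem | ||||
|         cdef int* counts | ||||
|         def __init__(self, int size): | ||||
|             self.mem = Pool() | ||||
|             self.counts = <int*>self.mem.alloc(size, sizeof(int)) | ||||
| 
 | ||||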
| p | ||||
|     |  All Cython modules should have the #[code # cython: infer_types=True] | ||||
|     |  compiler directive at the top of the file. This makes the code much | ||||
|     |  cleaner, as it avoids the need for many type declarations. If | ||||
|     |  possible, you should prefer to declare your functions #[code nogil], | ||||
|     |  even if you don't especially care about multi-threading. The reason | ||||
|     |  is that #[code nogil] functions help the Cython compiler reason about | ||||
|     |  your code quite a lot — you're telling the compiler that no Python | ||||
|     |  dynamics are possible. This lets many errors be raised, and ensures | ||||
|     |  your function will run at C speed. | ||||
| 
 | ||||
| 
 | ||||
| p | ||||
|     |  Cython gives you many choices of sequences: you could have a Python | ||||
|     |  list, a numpy array, a memory view, a C++ vector, or a pointer. | ||||
|     |  Pointers are preferred, because they are fastest, have the most | ||||
|     |  explicit semantics, and let the compiler check your code more | ||||
|     |  strictly. C++ vectors are also great — but you should only use them | ||||
|     |  internally in functions. It's less friendly to accept a vector as an | ||||
|     |  argument, because that asks the user to do much more work. Here's | ||||
|     |  how to get a pointer from a numpy array, memory view or vector: | ||||
| 
 | ||||
| +code. | ||||
|     cdef void get_pointers(np.ndarray[int, mode='c'] numpy_array, vector[int] cpp_vector, int[::1] memory_view) nogil: | ||||
|         pointer1 = <int*>numpy_array.data | ||||
|         pointer2 = cpp_vector.data() | ||||
|         pointer3 = &memory_view[0] | ||||
| 
 | ||||
| p | ||||
|     |  Both C arrays and C++ vectors reassure the compiler that no Python | ||||
|     |  operations are possible on your variable. This is a big advantage: | ||||
|     |  it lets the Cython compiler raise many more errors for you. | ||||
| 
 | ||||
| p | ||||
|     |  When getting a pointer from a numpy array or memoryview, take care | ||||
|     |  that the data is actually stored in C-contiguous order — otherwise | ||||
|     |  you'll get a pointer to nonsense. The type-declarations in the code | ||||
|     |  above should generate runtime errors if buffers with incorrect | ||||
|     |  memory layouts are passed in. To iterate over the array, the | ||||
|     |  following style is preferred: | ||||
| 
 | ||||
| +code. | ||||
|     cdef int c_total(const int* int_array, int length) nogil: | ||||
|         total = 0 | ||||
|         for item in int_array[:length]: | ||||
|             total += item | ||||
|         return total | ||||
| 
 | ||||
| p | ||||
|     |  If this is confusing, consider that the compiler couldn't deal with | ||||
|     |  #[code for item in int_array:] — there's no length attached to a raw | ||||
|     |  pointer, so how could we figure out where to stop? The length is | ||||
|     |  provided in the slice notation as a solution to this. Note that we | ||||
|     |  don't have to declare the type of #[code item] in the code above — | ||||
|     |  the compiler can easily infer it. This gives us tidy code that looks | ||||
|     |  quite like Python, but is exactly as fast as C — because we've made | ||||
|     |  sure the compilation to C is trivial. | ||||
| 
 | ||||
| p | ||||
|     |  Your functions cannot be declared #[code nogil] if they need to | ||||
|     |  create Python objects or call Python functions. This is perfectly | ||||
|     |  okay — you shouldn't torture your code just to get #[code nogil] | ||||
|     |  functions. However, if your function isn't #[code nogil], you should | ||||
|     |  compile your module with #[code cython -a --cplus my_module.pyx] and | ||||
|     |  open the resulting #[code my_module.html] file in a browser. This | ||||
|     |  will let you see how Cython is compiling your code. Calls into the | ||||
|     |  Python run-time will be in bright yellow. This lets you easily see | ||||
|     |  whether Cython is able to correctly type your code, or whether there | ||||
|     |  are unexpected problems. | ||||
| 
 | ||||
| p | ||||
|     |  Working in Cython is very rewarding once you're over the initial | ||||
|     |  learning curve. As with C and C++, the first way you write something | ||||
|     |  in Cython will often be the performance-optimal approach. In | ||||
|     |  contrast, Python optimisation generally requires a lot of | ||||
|     |  experimentation. Is it faster to have an #[code if item in my_dict] | ||||
|     |  check, or to use #[code .get()]? What about | ||||
|     |  #[code try]/#[code except]? Does this numpy operation create a copy? | ||||
|     |  There's no way to guess the answers to these questions, and you'll | ||||
|     |  usually be dissatisfied with your results — so there's no way to | ||||
|     |  know when to stop this process. In the worst case, you'll make a | ||||
|     |  mess that invites the next reader to try their luck too. This is | ||||
|     |  like one of those | ||||
|     |  #[+a("http://www.wemjournal.org/article/S1080-6032%2809%2970088-2/abstract") volcanic gas-traps], | ||||
|     |  where the rescuers keep passing out from low oxygen, causing | ||||
|     |  another rescuer to follow — only to succumb themselves. In short, | ||||
|     |  just say no to optimizing your Python. If it's not fast enough the | ||||
|     |  first time, just switch to Cython. | ||||
| 
 | ||||
| +infobox("Resources") | ||||
|     +list.o-no-block | ||||
|         +item #[+a("http://docs.cython.org/en/latest/") Official Cython documentation] (cython.org) | ||||
|         +item #[+a("https://explosion.ai/blog/writing-c-in-cython", true) Writing C in Cython] (explosion.ai) | ||||
|         +item #[+a("https://explosion.ai/blog/multithreading-with-cython") Multi-threading spaCy’s parser and named entity recogniser] (explosion.ai) | ||||
							
								
								
									
141  website/api/_architecture/_nn-model.jade  Normal file
							|  | @ -0,0 +1,141 @@ | |||
| //- 💫 DOCS > API > ARCHITECTURE > NN MODEL ARCHITECTURE | ||||
| 
 | ||||
| p | ||||
|     |  The parsing model is a blend of recent results. The two recent | ||||
|     |  inspirations have been the work of Eliyahu Kiperwasser and Yoav Goldberg at | ||||
|     |  Bar Ilan#[+fn(1)], and the SyntaxNet team from Google. The foundation of | ||||
|     |  the parser is still based on the work of Joakim Nivre#[+fn(2)], who | ||||
|     |  introduced the transition-based framework#[+fn(3)], the arc-eager | ||||
|     |  transition system, and the imitation learning objective. The model is | ||||
|     |  implemented using #[+a(gh("thinc")) Thinc], spaCy's machine learning | ||||
|     |  library. We first predict context-sensitive vectors for each word in the | ||||
|     |  input: | ||||
| 
 | ||||
| +code. | ||||
|     (embed_lower | embed_prefix | embed_suffix | embed_shape) | ||||
|         >> Maxout(token_width) | ||||
|         >> convolution ** 4 | ||||
| 
 | ||||
| p | ||||
|     |  This convolutional layer is shared between the tagger, parser and NER, | ||||
|     |  and will also be shared by the future neural lemmatizer. Because the | ||||
|     |  parser shares these layers with the tagger, the parser does not require | ||||
|     |  tag features. I got this trick from David Weiss's "Stack Combination" | ||||
|     |  paper#[+fn(4)]. | ||||
| 
 | ||||
| p | ||||
|     |  To boost the representation, the tagger actually predicts a "super tag" | ||||
|     |  with POS, morphology and dependency label#[+fn(5)]. The tagger predicts | ||||
|     |  these supertags by adding a softmax layer onto the convolutional layer – | ||||
|     |  so, we're teaching the convolutional layer to give us a representation | ||||
|     |  that's one affine transform from this informative lexical information. | ||||
|     |  This is obviously good for the parser (which backprops to the | ||||
|     |  convolutions too). The parser model makes a state vector by concatenating | ||||
|     |  the vector representations for its context tokens.  The current context | ||||
|     |  tokens: | ||||
| 
 | ||||
| +table | ||||
|     +row | ||||
|         +cell #[code S0], #[code S1], #[code S2] | ||||
|         +cell Top three words on the stack. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code B0], #[code B1] | ||||
|         +cell First two words of the buffer. | ||||
| 
 | ||||
|     +row | ||||
|         +cell.u-nowrap | ||||
|             |  #[code S0L1], #[code S1L1], #[code S2L1], #[code B0L1], | ||||
|             |  #[code B1L1]#[br] | ||||
|             |  #[code S0L2], #[code S1L2], #[code S2L2], #[code B0L2], | ||||
|             |  #[code B1L2] | ||||
|         +cell | ||||
|             |  Leftmost and second leftmost children of #[code S0], #[code S1], | ||||
|             |  #[code S2], #[code B0] and #[code B1]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell.u-nowrap | ||||
|             |  #[code S0R1], #[code S1R1], #[code S2R1], #[code B0R1], | ||||
|             |  #[code B1R1]#[br] | ||||
|             |  #[code S0R2], #[code S1R2], #[code S2R2], #[code B0R2], | ||||
|             |  #[code B1R2] | ||||
|         +cell | ||||
|             |  Rightmost and second rightmost children of #[code S0], #[code S1], | ||||
|             |  #[code S2], #[code B0] and #[code B1]. | ||||
| 
 | ||||
| p | ||||
|     |  This makes the state vector quite long: #[code 13*T], where #[code T] is | ||||
|     |  the token vector width (128 is working well). Fortunately, there's a way | ||||
|     |  to structure the computation to save some expense (and make it more | ||||
|     |  GPU-friendly). | ||||
| 
 | ||||
| p | ||||
|     |  The parser typically visits #[code 2*N] states for a sentence of length | ||||
|     |  #[code N] (although it may visit more, if it back-tracks with a | ||||
|     |  non-monotonic transition#[+fn(4)]). A naive implementation would require | ||||
|     |  #[code 2*N (B, 13*T) @ (13*T, H)] matrix multiplications for a batch of | ||||
|     |  size #[code B]. We can instead perform one #[code (B*N, T) @ (T, 13*H)] | ||||
|     |  multiplication, to pre-compute the hidden weights for each positional | ||||
|     |  feature with respect to the words in the batch. (Note that our token | ||||
|     |  vectors come from the CNN — so we can't play this trick over the | ||||
|     |  vocabulary. That's how Stanford's NN parser#[+fn(3)] works — and why its | ||||
|     |  model is so big.) | ||||
| 
 | ||||
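| p | ||||
|     |  A rough numpy sketch of the shapes involved, using toy dimensions and | ||||
|     |  random data rather than spaCy's actual implementation: | ||||
| 
 | ||||
| +code. | ||||
|     import numpy | ||||
|     B, N, T, H = 4, 10, 128, 64  # batch size, sentence length, token width, hidden width | ||||
|     tokens = numpy.random.randn(B * N, T)  # one CNN vector per token in the batch | ||||
|     W = numpy.random.randn(T, 13 * H)      # hidden weights for the 13 positional features | ||||
|     # One big multiplication per batch: every token's hidden contribution to | ||||
|     # every one of the 13 feature slots, computed up front. | ||||
|     precomputed = (tokens @ W).reshape(B * N, 13, H) | ||||
|     # During parsing, a state's hidden vector is then just a sum of 13 rows | ||||
|     # (bias and non-linearity omitted). feature_ids[slot] holds the token index | ||||
|     # filling that slot, or -1 if the slot is empty. | ||||
|     def state_vector(feature_ids): | ||||
|         hidden = numpy.zeros(H) | ||||
|         for slot, token in enumerate(feature_ids): | ||||
|             if token >= 0: | ||||
|                 hidden += precomputed[token, slot] | ||||
|         return hidden | ||||
|     vector = state_vector([0, 1, 2, 3, 4] + [-1] * 8)  # hypothetical feature layout | ||||
| 
 | ||||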
| p | ||||
|     |  This pre-computation strategy allows a nice compromise between | ||||
|     |  GPU-friendliness and implementation simplicity. The CNN and the wide | ||||
|     |  lower layer are computed on the GPU, and then the precomputed hidden | ||||
|     |  weights are moved to the CPU, before we start the transition-based | ||||
|     |  parsing process. This makes a lot of things much easier. We don't have to | ||||
|     |  worry about variable-length batch sizes, and we don't have to implement | ||||
|     |  the dynamic oracle in CUDA to train. | ||||
| 
 | ||||
| p | ||||
|     |  Currently the parser's loss function is multilabel log loss#[+fn(6)], as | ||||
|     |  the dynamic oracle allows multiple states to be 0 cost. The gradient of | ||||
|     |  the loss with respect to each score is defined as follows, where | ||||
|     |  #[code Z] is the sum of the exponentiated scores over all classes and | ||||
|     |  #[code gZ] is the same sum restricted to the zero-cost (gold) classes: | ||||
| 
 | ||||
| +code. | ||||
|     (exp(score) / Z) - (exp(score) / gZ) | ||||
| 
 | ||||
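| p | ||||
|     |  Spelled out in numpy, as a toy sketch rather than spaCy's actual | ||||
|     |  implementation: | ||||
| 
 | ||||
| +code. | ||||
|     import numpy | ||||
|     scores = numpy.array([2.0, 1.0, 0.5, -1.0])        # model scores for one state | ||||
|     is_gold = numpy.array([True, True, False, False])  # classes with zero cost | ||||
|     exp_scores = numpy.exp(scores) | ||||
|     Z = exp_scores.sum()            # over all classes | ||||
|     gZ = exp_scores[is_gold].sum()  # over zero-cost classes only | ||||
|     loss = -numpy.log(gZ / Z) | ||||
|     d_scores = exp_scores / Z - numpy.where(is_gold, exp_scores / gZ, 0.0) | ||||
| 
 | ||||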
| +bibliography | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41") Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations] | ||||
|         br | ||||
|         |  Eliyahu Kiperwasser, Yoav Goldberg. (2016) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4") A Dynamic Oracle for Arc-Eager Dependency Parsing] | ||||
|         br | ||||
|         |  Yoav Goldberg, Joakim Nivre (2012) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://explosion.ai/blog/parsing-english-in-python") Parsing English in 500 Lines of Python] | ||||
|         br | ||||
|         |  Matthew Honnibal (2013) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466") Stack-propagation: Improved Representation Learning for Syntax] | ||||
|         br | ||||
|         |  Yuan Zhang, David Weiss (2016) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86") Deep multi-task learning with low level tasks supervised at lower layers] | ||||
|         br | ||||
|         |  Anders Søgaard, Yoav Goldberg (2016) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c") An Improved Non-monotonic Transition System for Dependency Parsing] | ||||
|         br | ||||
|         |  Matthew Honnibal, Mark Johnson (2015) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf") A Fast and Accurate Dependency Parser using Neural Networks] | ||||
|         br | ||||
|     |  Danqi Chen, Christopher D. Manning (2014) | ||||
| 
 | ||||
|     +item | ||||
|         |  #[+a("https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2") Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques] | ||||
|         br | ||||
|         |  Stefan Riezler et al. (2002) | ||||
|  | @ -1,29 +1,32 @@ | |||
| { | ||||
|     "sidebar": { | ||||
|         "Introduction": { | ||||
|             "Facts & Figures": "./", | ||||
|             "Languages": "language-models", | ||||
|             "Annotation Specs": "annotation" | ||||
|         "Overview": { | ||||
|             "Architecture": "./", | ||||
|             "Annotation Specs": "annotation", | ||||
|             "Functions": "top-level" | ||||
|         }, | ||||
|         "Top-level": { | ||||
|             "spacy": "spacy", | ||||
|             "displacy": "displacy", | ||||
|             "Utility Functions": "util", | ||||
|             "Command line": "cli" | ||||
|         }, | ||||
|         "Classes": { | ||||
|         "Containers": { | ||||
|             "Doc": "doc", | ||||
|             "Token": "token", | ||||
|             "Span": "span", | ||||
|             "Lexeme": "lexeme" | ||||
|         }, | ||||
| 
 | ||||
|         "Pipeline": { | ||||
|             "Language": "language", | ||||
|             "Tokenizer": "tokenizer", | ||||
|             "Pipe": "pipe", | ||||
|             "Tensorizer": "tensorizer", | ||||
|             "Tagger": "tagger", | ||||
|             "DependencyParser": "dependencyparser", | ||||
|             "EntityRecognizer": "entityrecognizer", | ||||
|             "TextCategorizer": "textcategorizer", | ||||
|             "Tokenizer": "tokenizer", | ||||
|             "Lemmatizer": "lemmatizer", | ||||
|             "Matcher": "matcher", | ||||
|             "Lexeme": "lexeme", | ||||
|             "PhraseMatcher": "phrasematcher" | ||||
|         }, | ||||
| 
 | ||||
|         "Other": { | ||||
|             "Vocab": "vocab", | ||||
|             "StringStore": "stringstore", | ||||
|             "Vectors": "vectors", | ||||
|  | @ -34,52 +37,37 @@ | |||
|     }, | ||||
| 
 | ||||
|     "index": { | ||||
|         "title": "Facts & Figures", | ||||
|         "next": "language-models" | ||||
|         "title": "Architecture", | ||||
|         "next": "annotation", | ||||
|         "menu": { | ||||
|             "Basics": "basics", | ||||
|             "Neural Network Model": "nn-model", | ||||
|             "Cython Conventions": "cython" | ||||
|         } | ||||
|     }, | ||||
| 
 | ||||
|     "language-models": { | ||||
|         "title": "Languages", | ||||
|         "next": "philosophy" | ||||
|     }, | ||||
| 
 | ||||
|     "philosophy": { | ||||
|         "title": "Philosophy" | ||||
|     }, | ||||
| 
 | ||||
|     "spacy": { | ||||
|         "title": "spaCy top-level functions", | ||||
|         "source": "spacy/__init__.py", | ||||
|         "next": "displacy" | ||||
|     }, | ||||
| 
 | ||||
|     "displacy": { | ||||
|         "title": "displaCy", | ||||
|         "tag": "module", | ||||
|         "source": "spacy/displacy", | ||||
|         "next": "util" | ||||
|     }, | ||||
| 
 | ||||
|     "util": { | ||||
|         "title": "Utility Functions", | ||||
|         "source": "spacy/util.py", | ||||
|         "next": "cli" | ||||
|     }, | ||||
| 
 | ||||
|     "cli": { | ||||
|         "title": "Command Line Interface", | ||||
|         "source": "spacy/cli" | ||||
|     "top-level": { | ||||
|         "title": "Top-level Functions", | ||||
|         "menu": { | ||||
|             "spacy": "spacy", | ||||
|             "displacy": "displacy", | ||||
|             "Utility Functions": "util", | ||||
|             "Compatibility": "compat", | ||||
|             "Command Line": "cli" | ||||
|         } | ||||
|     }, | ||||
| 
 | ||||
|     "language": { | ||||
|         "title": "Language", | ||||
|         "tag": "class", | ||||
|         "teaser": "A text-processing pipeline.", | ||||
|         "source": "spacy/language.py" | ||||
|     }, | ||||
| 
 | ||||
|     "doc": { | ||||
|         "title": "Doc", | ||||
|         "tag": "class", | ||||
|         "teaser": "A container for accessing linguistic annotations.", | ||||
|         "source": "spacy/tokens/doc.pyx" | ||||
|     }, | ||||
| 
 | ||||
|  | @ -103,6 +91,7 @@ | |||
| 
 | ||||
|     "vocab": { | ||||
|         "title": "Vocab", | ||||
|         "teaser": "A storage class for vocabulary and other data shared across a language.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/vocab.pyx" | ||||
|     }, | ||||
|  | @ -115,10 +104,27 @@ | |||
| 
 | ||||
|     "matcher": { | ||||
|         "title": "Matcher", | ||||
|         "teaser": "Match sequences of tokens, based on pattern rules.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/matcher.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "phrasematcher": { | ||||
|         "title": "PhraseMatcher", | ||||
|         "teaser": "Match sequences of tokens, based on documents.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/matcher.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "pipe": { | ||||
|         "title": "Pipe", | ||||
|         "teaser": "Abstract base class defining the API for pipeline components.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "dependenyparser": { | ||||
|         "title": "DependencyParser", | ||||
|         "tag": "class", | ||||
|  | @ -127,18 +133,22 @@ | |||
| 
 | ||||
|     "entityrecognizer": { | ||||
|         "title": "EntityRecognizer", | ||||
|         "teaser": "Annotate named entities on documents.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "textcategorizer": { | ||||
|         "title": "TextCategorizer", | ||||
|         "teaser": "Add text categorization models to spaCy pipelines.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "dependencyparser": { | ||||
|         "title": "DependencyParser", | ||||
|         "teaser": "Annotate syntactic dependencies on documents.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
|  | @ -149,15 +159,23 @@ | |||
|         "source": "spacy/tokenizer.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "lemmatizer": { | ||||
|         "title": "Lemmatizer", | ||||
|         "tag": "class" | ||||
|     }, | ||||
| 
 | ||||
|     "tagger": { | ||||
|         "title": "Tagger", | ||||
|         "teaser": "Annotate part-of-speech tags on documents.", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "tensorizer": { | ||||
|         "title": "Tensorizer", | ||||
|         "teaser": "Add a tensor with position-sensitive meaning representations to a document.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|  | @ -169,23 +187,38 @@ | |||
| 
 | ||||
|     "goldcorpus": { | ||||
|         "title": "GoldCorpus", | ||||
|         "teaser": "An annotated corpus, using the JSON file format.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/gold.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "binder": { | ||||
|         "title": "Binder", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/tokens/binder.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "vectors": { | ||||
|         "title": "Vectors", | ||||
|         "teaser": "Store, save and load word vectors.", | ||||
|         "tag": "class", | ||||
|         "tag_new": 2, | ||||
|         "source": "spacy/vectors.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "annotation": { | ||||
|         "title": "Annotation Specifications" | ||||
|         "title": "Annotation Specifications", | ||||
|         "teaser": "Schemes used for labels, tags and training data.", | ||||
|         "menu": { | ||||
|             "Tokenization": "tokenization", | ||||
|             "Sentence Boundaries": "sbd", | ||||
|             "POS Tagging": "pos-tagging", | ||||
|             "Lemmatization": "lemmatization", | ||||
|             "Dependencies": "dependency-parsing", | ||||
|             "Named Entities": "named-entities", | ||||
|             "Training Data": "training" | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | @ -1,26 +1,17 @@ | |||
| //- 💫 DOCS > USAGE > COMMAND LINE INTERFACE | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| //- 💫 DOCS > API > TOP-LEVEL > COMMAND LINE INTERFACE | ||||
| 
 | ||||
| p | ||||
|     |  As of v1.7.0, spaCy comes with new command line helpers to download and | ||||
|     |  link models and show useful debugging information. For a list of available | ||||
|     |  commands, type #[code spacy --help]. | ||||
| 
 | ||||
| +infobox("⚠️ Deprecation note") | ||||
|     |  As of spaCy 2.0, the #[code model] command to initialise a model data | ||||
|     |  directory is deprecated. The command was only necessary because previous | ||||
|     |  versions of spaCy expected a model directory to already be set up. This | ||||
|     |  has since been changed, so you can use the #[+api("cli#train") #[code train]] | ||||
|     |  command straight away. | ||||
| 
 | ||||
| +h(2, "download") Download | ||||
| +h(3, "download") Download | ||||
| 
 | ||||
| p | ||||
|     |  Download #[+a("/docs/usage/models") models] for spaCy. The downloader finds the | ||||
|     |  Download #[+a("/usage/models") models] for spaCy. The downloader finds the | ||||
|     |  best-matching compatible version, uses pip to download the model as a | ||||
|     |  package and automatically creates a | ||||
|     |  #[+a("/docs/usage/models#usage") shortcut link] to load the model by name. | ||||
|     |  #[+a("/usage/models#usage") shortcut link] to load the model by name. | ||||
|     |  Direct downloads don't perform any compatibility checks and require the | ||||
|     |  model name to be specified with its version (e.g., #[code en_core_web_sm-1.2.0]). | ||||
| 
 | ||||
|  | @ -49,15 +40,15 @@ p | |||
|     |  detailed messages in case things go wrong. It's #[strong not recommended] | ||||
|     |  to use this command as part of an automated process. If you know which | ||||
|     |  model your project needs, you should consider a | ||||
|     |  #[+a("/docs/usage/models#download-pip") direct download via pip], or | ||||
|     |  #[+a("/usage/models#download-pip") direct download via pip], or | ||||
|     |  uploading the model to a local PyPI installation and fetching it straight | ||||
|     |  from there. This will also allow you to add it as a versioned package | ||||
|     |  dependency to your project. | ||||
| 
 | ||||
| +h(2, "link") Link | ||||
| +h(3, "link") Link | ||||
| 
 | ||||
| p | ||||
|     |  Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, | ||||
|     |  Create a #[+a("/usage/models#usage") shortcut link] for a model, | ||||
|     |  either a Python package or a local directory. This will let you load | ||||
|     |  models from any location using a custom name via | ||||
|     |  #[+api("spacy#load") #[code spacy.load()]]. | ||||
|  | @ -95,7 +86,7 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "info") Info | ||||
| +h(3, "info") Info | ||||
| 
 | ||||
| p | ||||
|     |  Print information about your spaCy installation, models and local setup, | ||||
|  | @ -122,15 +113,15 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "convert") Convert | ||||
| +h(3, "convert") Convert | ||||
| 
 | ||||
| p | ||||
|     |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] | ||||
|     |  Convert files into spaCy's #[+a("/api/annotation#json-input") JSON format] | ||||
|     |  for use with the #[code train] command and other experiment management | ||||
|     |  functions. The right converter is chosen based on the file extension of | ||||
|     |  the input file. Currently only supports #[code .conllu]. | ||||
| 
 | ||||
| +code(false, "bash", "$"). | ||||
| +code(false, "bash", "$", false, false, true). | ||||
|     spacy convert [input_file] [output_dir] [--n-sents] [--morphology] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|  | @ -159,14 +150,18 @@ p | |||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(2, "train") Train | ||||
| +h(3, "train") Train | ||||
| 
 | ||||
| p | ||||
|     |  Train a model. Expects data in spaCy's | ||||
|     |  #[+a("/docs/api/annotation#json-input") JSON format]. | ||||
|     |  #[+a("/api/annotation#json-input") JSON format]. On each epoch, a model | ||||
|     |  will be saved out to the directory. Accuracy scores and model details | ||||
|     |  will be added to a #[+a("/usage/training#models-generating") #[code meta.json]] | ||||
|     |  to allow packaging the model using the | ||||
|     |  #[+api("cli#package") #[code package]] command. | ||||
| 
 | ||||
| +code(false, "bash", "$"). | ||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] | ||||
| +code(false, "bash", "$", false, false, true). | ||||
|     spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--no-entities] [--gold-preproc] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -204,6 +199,27 @@ p | |||
|         +cell option | ||||
|         +cell Use GPU. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --vectors], #[code -v] | ||||
|         +cell option | ||||
|         +cell Model to load vectors from. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --meta-path], #[code -m] | ||||
|         +cell option | ||||
|         +cell | ||||
|             |  #[+tag-new(2)] Optional path to model | ||||
|             |  #[+a("/usage/training#models-generating") #[code meta.json]]. | ||||
|             |  All relevant properties like #[code lang], #[code pipeline] and | ||||
|             |  #[code spacy_version] will be overwritten. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --version], #[code -V] | ||||
|         +cell option | ||||
|         +cell | ||||
|             |  Model version. Will be written out to the model's | ||||
|             |  #[code meta.json] after training. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --no-tagger], #[code -T] | ||||
|         +cell flag | ||||
|  | @ -219,12 +235,18 @@ p | |||
|         +cell flag | ||||
|         +cell Don't train NER. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --gold-preproc], #[code -G] | ||||
|         +cell flag | ||||
|         +cell Use gold preprocessing. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --help], #[code -h] | ||||
|         +cell flag | ||||
|         +cell Show help message and available arguments. | ||||
| 
 | ||||
| +h(3, "train-hyperparams") Environment variables for hyperparameters | ||||
| +h(4, "train-hyperparams") Environment variables for hyperparameters | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  spaCy lets you set hyperparameters for training via environment variables. | ||||
|  | @ -236,98 +258,149 @@ p | |||
| +code(false, "bash"). | ||||
|     parser_hidden_depth=2 parser_maxout_pieces=1 train-parser | ||||
| 
 | ||||
| +under-construction | ||||
| 
 | ||||
| +table(["Name", "Description", "Default"]) | ||||
|     +row | ||||
|         +cell #[code dropout_from] | ||||
|         +cell | ||||
|         +cell Initial dropout rate. | ||||
|         +cell #[code 0.2] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code dropout_to] | ||||
|         +cell | ||||
|         +cell Final dropout rate. | ||||
|         +cell #[code 0.2] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code dropout_decay] | ||||
|         +cell | ||||
|         +cell Rate of dropout change. | ||||
|         +cell #[code 0.0] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code batch_from] | ||||
|         +cell | ||||
|         +cell Initial batch size. | ||||
|         +cell #[code 1] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code batch_to] | ||||
|         +cell | ||||
|         +cell Final batch size. | ||||
|         +cell #[code 64] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code batch_compound] | ||||
|         +cell | ||||
|         +cell Rate of batch size acceleration. | ||||
|         +cell #[code 1.001] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code token_vector_width] | ||||
|         +cell | ||||
|         +cell Width of embedding tables and convolutional layers. | ||||
|         +cell #[code 128] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code embed_size] | ||||
|         +cell | ||||
|         +cell Number of rows in embedding tables. | ||||
|         +cell #[code 7500] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code parser_maxout_pieces] | ||||
|         +cell | ||||
|         +cell Number of pieces in the parser's and NER's first maxout layer. | ||||
|         +cell #[code 2] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code parser_hidden_depth] | ||||
|         +cell | ||||
|         +cell Number of hidden layers in the parser and NER. | ||||
|         +cell #[code 1] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code hidden_width] | ||||
|         +cell | ||||
|         +cell Size of the parser's and NER's hidden layers. | ||||
|         +cell #[code 128] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code learn_rate] | ||||
|         +cell | ||||
|         +cell Learning rate. | ||||
|         +cell #[code 0.001] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code optimizer_B1] | ||||
|         +cell | ||||
|         +cell Momentum for the Adam solver. | ||||
|         +cell #[code 0.9] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code optimizer_B2] | ||||
|         +cell | ||||
|         +cell Second-moment decay (RMSProp-style term) for the Adam solver. | ||||
|         +cell #[code 0.999] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code optimizer_eps] | ||||
|         +cell | ||||
|         +cell Epsilon value for the Adam solver. | ||||
|         +cell #[code 1e-08] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code L2_penalty] | ||||
|         +cell | ||||
|         +cell L2 regularisation penalty. | ||||
|         +cell #[code 1e-06] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code grad_norm_clip] | ||||
|         +cell | ||||
|         +cell Gradient L2 norm constraint. | ||||
|         +cell #[code 1.0] | ||||
| 
 | ||||
| +h(2, "package") Package | ||||
| +h(3, "evaluate") Evaluate | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] | ||||
|     |  Evaluate a model's accuracy and speed on JSON-formatted annotated data. | ||||
|     |  Will print the results and optionally export | ||||
|     |  #[+a("/usage/visualizers") displaCy visualizations] of a sample set of | ||||
|     |  parses to #[code .html] files. Visualizations for the dependency parse | ||||
|     |  and NER will be exported as separate files if the respective component | ||||
|     |  is present in the model's pipeline. | ||||
| 
 | ||||
| +code(false, "bash", "$", false, false, true). | ||||
|     spacy evaluate [model] [data_path] [--displacy-path] [--displacy-limit] [--gpu-id] [--gold-preproc] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code model] | ||||
|         +cell positional | ||||
|         +cell | ||||
|             |  Model to evaluate. Can be a package or shortcut link name, or a | ||||
|             |  path to a model data directory. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code data_path] | ||||
|         +cell positional | ||||
|         +cell Location of JSON-formatted evaluation data. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --displacy-path], #[code -dp] | ||||
|         +cell option | ||||
|         +cell | ||||
|             |  Directory to output rendered parses as HTML. If not set, no | ||||
|             |  visualizations will be generated. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --displacy-limit], #[code -dl] | ||||
|         +cell option | ||||
|         +cell | ||||
|             |  Number of parses to generate per file. Defaults to #[code 25]. | ||||
|             |  Keep in mind that a significantly higher number might cause the | ||||
|             |  #[code .html] files to render slowly. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --gpu-id], #[code -g] | ||||
|         +cell option | ||||
|         +cell GPU to use, if any. Defaults to #[code -1] for CPU. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --gold-preproc], #[code -G] | ||||
|         +cell flag | ||||
|         +cell Use gold preprocessing. | ||||
| 
 | ||||
| 
 | ||||
| +h(3, "package") Package | ||||
| 
 | ||||
| p | ||||
|     |  Generate a #[+a("/usage/training#models-generating") model Python package] | ||||
|     |  from an existing model data directory. All data files are copied over. | ||||
|     |  If the path to a meta.json is supplied, or a meta.json is found in the | ||||
|     |  input directory, this file is used. Otherwise, the data can be entered | ||||
|  | @ -336,8 +409,8 @@ p | |||
|     |  sure you're always using the latest versions. This means you need to be | ||||
|     |  connected to the internet to use this command. | ||||
| 
 | ||||
| +code(false, "bash", "$"). | ||||
|     spacy package [input_dir] [output_dir] [--meta] [--force] | ||||
| +code(false, "bash", "$", false, false, true). | ||||
|     spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -353,14 +426,14 @@ p | |||
|     +row | ||||
|         +cell #[code --meta-path], #[code -m] | ||||
|         +cell option | ||||
|         +cell Path to meta.json file (optional). | ||||
|         +cell #[+tag-new(2)] Path to meta.json file (optional). | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --create-meta], #[code -c] | ||||
|         +cell flag | ||||
|         +cell | ||||
|             |  Create a meta.json file on the command line, even if one already | ||||
|             |  exists in the directory. | ||||
|             |  #[+tag-new(2)] Create a meta.json file on the command line, even | ||||
|             |  if one already exists in the directory. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code --force], #[code -f] | ||||
							
								
								
									
91 website/api/_top-level/_compat.jade Normal file
							|  | @ -0,0 +1,91 @@ | |||
| //- 💫 DOCS > API > TOP-LEVEL > COMPATIBILITY | ||||
| 
 | ||||
| p | ||||
|     |  All Python code is written in an | ||||
|     |  #[strong intersection of Python 2 and Python 3]. This is easy in Cython, | ||||
|     |  but somewhat ugly in Python. Logic that deals with Python or platform | ||||
|     |  compatibility only lives in #[code spacy.compat]. To distinguish them from | ||||
|     |  the builtin functions, replacement functions are suffixed with an | ||||
|     |  underscore, e.g. #[code unicode_]. For specific checks, spaCy uses the | ||||
|     |  #[code six] and #[code ftfy] packages. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.compat import unicode_, json_dumps | ||||
| 
 | ||||
|     compatible_unicode = unicode_('hello world') | ||||
|     compatible_json = json_dumps({'key': 'value'}) | ||||
| 
 | ||||
| +table(["Name", "Python 2", "Python 3"]) | ||||
|     +row | ||||
|         +cell #[code compat.bytes_] | ||||
|         +cell #[code str] | ||||
|         +cell #[code bytes] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code compat.unicode_] | ||||
|         +cell #[code unicode] | ||||
|         +cell #[code str] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code compat.basestring_] | ||||
|         +cell #[code basestring] | ||||
|         +cell #[code str] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code compat.input_] | ||||
|         +cell #[code raw_input] | ||||
|         +cell #[code input] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code compat.json_dumps] | ||||
|         +cell #[code ujson.dumps] with #[code .decode('utf8')] | ||||
|         +cell #[code ujson.dumps] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code compat.path2str] | ||||
|         +cell #[code str(path)] with #[code .decode('utf8')] | ||||
|         +cell #[code str(path)] | ||||
| 
 | ||||
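| p | ||||
|     |  A minimal sketch combining two of the replacements above. As listed | ||||
|     |  in the table, #[code path2str] returns a unicode string on both | ||||
|     |  Python 2 and Python 3, so its result always matches #[code unicode_]: | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from pathlib import Path | ||||
|     from spacy.compat import unicode_, path2str | ||||
| 
 | ||||
|     # convert a Path to a string with the same type on Python 2 and 3 | ||||
|     model_path = Path('/tmp/model') | ||||
|     assert isinstance(path2str(model_path), unicode_) | ||||
| 
 | ||||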
| +h(3, "is_config") compat.is_config | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|     |  Check if a specific configuration of Python version and operating system | ||||
|     |  matches the user's setup. Mostly used to display targeted error messages. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.compat import is_config | ||||
| 
 | ||||
|     if is_config(python2=True, windows=True): | ||||
|         print("You are using Python 2 on Windows.") | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code python2] | ||||
|         +cell bool | ||||
|         +cell spaCy is executed with Python 2.x. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code python3] | ||||
|         +cell bool | ||||
|         +cell spaCy is executed with Python 3.x. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code windows] | ||||
|         +cell bool | ||||
|         +cell spaCy is executed on Windows. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code linux] | ||||
|         +cell bool | ||||
|         +cell spaCy is executed on Linux. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code osx] | ||||
|         +cell bool | ||||
|         +cell spaCy is executed on OS X or macOS. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell Whether the specified configuration matches the user's platform. | ||||
|  | @ -1,14 +1,12 @@ | |||
| //- 💫 DOCS > API > DISPLACY | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| //- 💫 DOCS > API > TOP-LEVEL > DISPLACY | ||||
| 
 | ||||
| p | ||||
|     |  As of v2.0, spaCy comes with a built-in visualization suite. For more | ||||
|     |  info and examples, see the usage guide on | ||||
|     |  #[+a("/docs/usage/visualizers") visualizing spaCy]. | ||||
|     |  #[+a("/usage/visualizers") visualizing spaCy]. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "serve") displacy.serve | ||||
| +h(3, "displacy.serve") displacy.serve | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -60,7 +58,7 @@ p | |||
|         +cell bool | ||||
|         +cell | ||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | ||||
|             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||
|             |  for formats and examples. | ||||
|         +cell #[code False] | ||||
| 
 | ||||
|  | @ -70,7 +68,7 @@ p | |||
|         +cell Port to serve visualization. | ||||
|         +cell #[code 5000] | ||||
| 
 | ||||
| +h(2, "render") displacy.render | ||||
| +h(3, "displacy.render") displacy.render | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -127,24 +125,24 @@ p Render a dependency parse tree or named entity visualization. | |||
|         +cell bool | ||||
|         +cell | ||||
|             |  Don't parse #[code Doc] and instead, expect a dict or list of | ||||
|             |  dicts. #[+a("/docs/usage/visualizers#manual-usage") See here] | ||||
|             |  dicts. #[+a("/usage/visualizers#manual-usage") See here] | ||||
|             |  for formats and examples. | ||||
|         +cell #[code False] | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell unicode | ||||
|         +cell Rendered HTML markup. | ||||
|         +cell | ||||
| 
 | ||||
| +h(2, "options") Visualizer options | ||||
| +h(3, "displacy_options") Visualizer options | ||||
| 
 | ||||
| p | ||||
|     |  The #[code options] argument lets you specify additional settings for | ||||
|     |  each visualizer. If a setting is not present in the options, the default | ||||
|     |  value will be used. | ||||
| 
 | ||||
| +h(3, "options-dep") Dependency Visualizer options | ||||
| +h(4, "options-dep") Dependency Visualizer options | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     options = {'compact': True, 'color': 'blue'} | ||||
|  | @ -219,7 +217,7 @@ p | |||
|         +cell Distance between words in px. | ||||
|         +cell #[code 175] / #[code 85] (compact) | ||||
| 
 | ||||
| +h(3, "options-ent") Named Entity Visualizer options | ||||
| +h(4, "displacy_options-ent") Named Entity Visualizer options | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     options = {'ents': ['PERSON', 'ORG', 'PRODUCT'], | ||||
|  | @ -244,6 +242,6 @@ p | |||
| 
 | ||||
| p | ||||
|     |  By default, displaCy comes with colours for all | ||||
|     |  #[+a("/docs/api/annotation#named-entities") entity types supported by spaCy]. | ||||
|     |  #[+a("/api/annotation#named-entities") entity types supported by spaCy]. | ||||
|     |  If you're using custom entity types, you can use the #[code colors] | ||||
|     |  setting to add your own colours for them. | ||||
|  | @ -1,15 +1,13 @@ | |||
| //- 💫 DOCS > API > SPACY | ||||
| //- 💫 DOCS > API > TOP-LEVEL > SPACY | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| +h(2, "load") spacy.load | ||||
| +h(3, "spacy.load") spacy.load | ||||
|     +tag function | ||||
|     +tag-model | ||||
| 
 | ||||
| p | ||||
|     |  Load a model via its #[+a("/docs/usage/models#usage") shortcut link], | ||||
|     |  Load a model via its #[+a("/usage/models#usage") shortcut link], | ||||
|     |  the name of an installed | ||||
|     |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode | ||||
|     |  #[+a("/usage/training#models-generating") model package], a unicode | ||||
|     |  path or a #[code Path]-like object. spaCy will try resolving the load | ||||
|     |  argument in this order. If a model is loaded from a shortcut link or | ||||
|     |  package name, spaCy will assume it's a Python package and import it and | ||||
|  | @ -38,25 +36,57 @@ p | |||
|         +cell list | ||||
|         +cell | ||||
|             |  Names of pipeline components to | ||||
|             |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. | ||||
|             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell A #[code Language] object with the loaded model. | ||||
| 
 | ||||
| +infobox("⚠️ Deprecation note") | ||||
| +infobox("Deprecation note", "⚠️") | ||||
|     .o-block | ||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||
|         |  will also raise an error if no model could be loaded and never just | ||||
|         |  return an empty #[code Language] object. If you need a blank language, | ||||
|         |  you need to import it explicitly (#[code from spacy.lang.en import English]) | ||||
|         |  or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. | ||||
|         |  you can use the new function #[+api("spacy#blank") #[code spacy.blank()]] | ||||
|         |  or import the class explicitly, e.g. | ||||
|         |  #[code from spacy.lang.en import English]. | ||||
| 
 | ||||
|     +code-new nlp = spacy.load('/model') | ||||
|     +code-old nlp = spacy.load('en', path='/model') | ||||
| 
 | ||||
| +h(2, "info") spacy.info | ||||
| +h(3, "spacy.blank") spacy.blank | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Create a blank model of a given language class. This function is the | ||||
|     |  twin of #[code spacy.load()]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     nlp_en = spacy.blank('en') | ||||
|     nlp_de = spacy.blank('de') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell ISO code of the language class to load. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code disable] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  Names of pipeline components to | ||||
|             |  #[+a("/usage/processing-pipelines#disabling") disable]. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell An empty #[code Language] object of the appropriate subclass. | ||||
| 
 | ||||
| 
 | ||||
| +h(4, "spacy.info") spacy.info | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|  | @ -83,13 +113,13 @@ p | |||
|         +cell Print information as Markdown. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "explain") spacy.explain | ||||
| +h(3, "spacy.explain") spacy.explain | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|     |  Get a description for a given POS tag, dependency label or entity type. | ||||
|     |  For a list of available terms, see | ||||
|     |  #[+src(gh("spacy", "spacy/glossary.py")) glossary.py]. | ||||
|     |  #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     spacy.explain('NORP') | ||||
|  | @ -107,18 +137,18 @@ p | |||
|         +cell unicode | ||||
|         +cell Term to explain. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell unicode | ||||
|         +cell The explanation, or #[code None] if not found in the glossary. | ||||
| 
 | ||||
| +h(2, "set_factory") spacy.set_factory | ||||
| +h(3, "spacy.set_factory") spacy.set_factory | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Set a factory that returns a custom | ||||
|     |  #[+a("/docs/usage/language-processing-pipeline") processing pipeline] | ||||
|     |  #[+a("/usage/processing-pipelines") processing pipeline] | ||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|  | @ -1,10 +1,8 @@ | |||
| //- 💫 DOCS > API > UTIL | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| //- 💫 DOCS > API > TOP-LEVEL > UTIL | ||||
| 
 | ||||
| p | ||||
|     |  spaCy comes with a small collection of utility functions located in | ||||
|     |  #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. | ||||
|     |  #[+src(gh("spaCy", "spacy/util.py")) #[code spacy/util.py]]. | ||||
|     |  Because utility functions are mostly intended for | ||||
|     |  #[strong internal use within spaCy], their behaviour may change with | ||||
|     |  future releases. The functions documented on this page should be safe | ||||
|  | @ -12,7 +10,7 @@ p | |||
|     |  recommend having additional tests in place if your application depends on | ||||
|     |  any of spaCy's utilities. | ||||
| 
 | ||||
| +h(2, "get_data_path") util.get_data_path | ||||
| +h(3, "util.get_data_path") util.get_data_path | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|  | @ -25,12 +23,12 @@ p | |||
|         +cell bool | ||||
|         +cell Only return path if it exists, otherwise return #[code None]. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Path] / #[code None] | ||||
|         +cell Data path or #[code None]. | ||||
| 
 | ||||
| +h(2, "set_data_path") util.set_data_path | ||||
| +h(3, "util.set_data_path") util.set_data_path | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|  | @ -47,12 +45,12 @@ p | |||
|         +cell unicode or #[code Path] | ||||
|         +cell Path to new data directory. | ||||
| 
 | ||||
| +h(2, "get_lang_class") util.get_lang_class | ||||
| +h(3, "util.get_lang_class") util.get_lang_class | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|     |  Import and load a #[code Language] class. Allows lazy-loading | ||||
|     |  #[+a("/docs/usage/adding-languages") language data] and importing | ||||
|     |  #[+a("/usage/adding-languages") language data] and importing | ||||
|     |  languages using the two-letter language code. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|  | @ -67,12 +65,12 @@ p | |||
|         +cell unicode | ||||
|         +cell Two-letter language code, e.g. #[code 'en']. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell Language class. | ||||
| 
 | ||||
| +h(2, "load_model") util.load_model | ||||
| +h(3, "util.load_model") util.load_model | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -101,12 +99,12 @@ p | |||
|         +cell - | ||||
|         +cell Specific overrides, like pipeline components to disable. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell #[code Language] class with the loaded model. | ||||
| 
 | ||||
| +h(2, "load_model_from_path") util.load_model_from_path | ||||
| +h(3, "util.load_model_from_path") util.load_model_from_path | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -139,18 +137,18 @@ p | |||
|         +cell - | ||||
|         +cell Specific overrides, like pipeline components to disable. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell #[code Language] class with the loaded model. | ||||
| 
 | ||||
| +h(2, "load_model_from_init_py") util.load_model_from_init_py | ||||
| +h(3, "util.load_model_from_init_py") util.load_model_from_init_py | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  A helper function to use in the #[code load()] method of a model package's | ||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. | ||||
|     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) #[code __init__.py]]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.util import load_model_from_init_py | ||||
|  | @ -169,12 +167,12 @@ p | |||
|         +cell - | ||||
|         +cell Specific overrides, like pipeline components to disable. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Language] | ||||
|         +cell #[code Language] class with the loaded model. | ||||
| 
 | ||||
| +h(2, "get_model_meta") util.get_model_meta | ||||
| +h(3, "util.get_model_meta") util.get_model_meta | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -190,17 +188,17 @@ p | |||
|         +cell unicode or #[code Path] | ||||
|         +cell Path to model directory. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell dict | ||||
|         +cell The model's meta data. | ||||
| 
 | ||||
| +h(2, "is_package") util.is_package | ||||
| +h(3, "util.is_package") util.is_package | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|     |  Check if string maps to a package installed via pip. Mainly used to | ||||
|     |  validate #[+a("/docs/usage/models") model packages]. | ||||
|     |  validate #[+a("/usage/models") model packages]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     util.is_package('en_core_web_sm') # True | ||||
|  | @ -212,18 +210,18 @@ p | |||
|         +cell unicode | ||||
|         +cell Name of package. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code bool] | ||||
|         +cell #[code True] if installed package, #[code False] if not. | ||||
| 
 | ||||
| +h(2, "get_package_path") util.get_package_path | ||||
| +h(3, "util.get_package_path") util.get_package_path | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Get path to an installed package. Mainly used to resolve the location of | ||||
|     |  #[+a("/docs/usage/models") model packages]. Currently imports the package | ||||
|     |  #[+a("/usage/models") model packages]. Currently imports the package | ||||
|     |  to find its path. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|  | @ -236,12 +234,12 @@ p | |||
|         +cell unicode | ||||
|         +cell Name of installed package. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Path] | ||||
|         +cell Path to model package directory. | ||||
| 
 | ||||
| +h(2, "is_in_jupyter") util.is_in_jupyter | ||||
| +h(3, "util.is_in_jupyter") util.is_in_jupyter | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
|  | @ -257,17 +255,17 @@ p | |||
|         return display(HTML(html)) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell #[code True] if in Jupyter, #[code False] if not. | ||||
| 
 | ||||
| +h(2, "update_exc") util.update_exc | ||||
| +h(3, "util.update_exc") util.update_exc | ||||
|     +tag function | ||||
| 
 | ||||
| p | ||||
|     |  Update, validate and overwrite | ||||
|     |  #[+a("/docs/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | ||||
|     |  #[+a("/usage/adding-languages#tokenizer-exceptions") tokenizer exceptions]. | ||||
|     |  Used to combine global exceptions with custom, language-specific | ||||
|     |  exceptions. Will raise an error if a key doesn't match the #[code ORTH] values. | ||||
| 
 | ||||
|  | @ -288,20 +286,20 @@ p | |||
|         +cell dicts | ||||
|         +cell Exception dictionaries to add to the base exceptions, in order. | ||||
| 
 | ||||
|     +footrow | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell dict | ||||
|         +cell Combined tokenizer exceptions. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "prints") util.prints | ||||
| +h(3, "util.prints") util.prints | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Print a formatted, text-wrapped message with optional title. If a text | ||||
|     |  argument is a #[code Path], it's converted to a string. Should only | ||||
|     |  be used for interactive components like the #[+api("cli") cli]. | ||||
|     |  be used for interactive components like the command-line interface. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     data_path = Path('/some/path') | ||||
							
								
								
									
131 website/api/annotation.jade Normal file
							|  | @ -0,0 +1,131 @@ | |||
| //- 💫 DOCS > API > ANNOTATION SPECS | ||||
| 
 | ||||
| include ../_includes/_mixins | ||||
| 
 | ||||
| p This document describes the target annotations spaCy is trained to predict. | ||||
| 
 | ||||
| 
 | ||||
| +section("tokenization") | ||||
|     +h(2, "tokenization") Tokenization | ||||
| 
 | ||||
|     p | ||||
|         |  Tokenization standards are based on the | ||||
|         |  #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus. | ||||
|         |  The tokenizer differs from most by including tokens for significant | ||||
|         |  whitespace. Any sequence of whitespace characters beyond a single space | ||||
|         |  (#[code ' ']) is included as a token. | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         from spacy.lang.en import English | ||||
|         nlp = English() | ||||
|         tokens = nlp('Some\nspaces  and\ttab characters') | ||||
|         tokens_text = [t.text for t in tokens] | ||||
|         assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', | ||||
|                             '\t', 'tab', 'characters'] | ||||
| 
 | ||||
|     p | ||||
|         |  The whitespace tokens are useful for much the same reason punctuation is | ||||
|         |  – it's often an important delimiter in the text. By preserving it in the | ||||
|         |  token output, we are able to maintain a simple alignment between the | ||||
|         |  tokens and the original string, and we ensure that no information is | ||||
|         |  lost during processing. | ||||
| 
 | ||||
| +section("sbd") | ||||
|     +h(2, "sentence-boundary") Sentence boundary detection | ||||
| 
 | ||||
|     p | ||||
|         |  Sentence boundaries are calculated from the syntactic parse tree, so | ||||
|         |  features such as punctuation and capitalisation play an important but | ||||
|         |  non-decisive role in determining the sentence boundaries. Usually this | ||||
|         |  means that the sentence boundaries will at least coincide with clause | ||||
|         |  boundaries, even given poorly punctuated text. | ||||
| 
 | ||||
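|     p | ||||
|         |  Because the boundaries come from the parse, sentence spans are | ||||
|         |  available on the #[code Doc] once the parser has run. A minimal | ||||
|         |  sketch, assuming an installed English model such as | ||||
|         |  #[code en_core_web_sm]: | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         import spacy | ||||
|         # assumes the en_core_web_sm model is installed | ||||
|         nlp = spacy.load('en_core_web_sm') | ||||
|         doc = nlp(u"This is a sentence. This is another sentence.") | ||||
|         for sent in doc.sents: | ||||
|             print(sent.text) | ||||
| 
 | ||||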
| +section("pos-tagging") | ||||
|     +h(2, "pos-tagging") Part-of-speech Tagging | ||||
| 
 | ||||
|     +aside("Tip: Understanding tags") | ||||
|         |  You can also use #[code spacy.explain()] to get the description for the | ||||
|         |  string representation of a tag. For example, | ||||
|         |  #[code spacy.explain("RB")] will return "adverb". | ||||
| 
 | ||||
|     include _annotation/_pos-tags | ||||
| 
 | ||||
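|     p | ||||
|         |  The fine-grained tag is available as #[code Token.tag_] and the | ||||
|         |  coarse-grained part-of-speech as #[code Token.pos_]. A minimal | ||||
|         |  sketch, assuming an installed English model such as | ||||
|         |  #[code en_core_web_sm]: | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         import spacy | ||||
|         # assumes the en_core_web_sm model is installed | ||||
|         nlp = spacy.load('en_core_web_sm') | ||||
|         doc = nlp(u"They watched a movie") | ||||
|         for token in doc: | ||||
|             # coarse-grained .pos_ and fine-grained .tag_ | ||||
|             print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_)) | ||||
| 
 | ||||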
| +section("lemmatization") | ||||
|     +h(2, "lemmatization") Lemmatization | ||||
| 
 | ||||
|     p A "lemma" is the uninflected form of a word. In English, this means: | ||||
| 
 | ||||
|     +list | ||||
|         +item #[strong Adjectives]: The form like "happy", not "happier" or "happiest" | ||||
|         +item #[strong Adverbs]: The form like "badly", not "worse" or "worst" | ||||
|         +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" | ||||
|         +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" | ||||
| 
 | ||||
|     p | ||||
|         |  The lemmatization data is taken from | ||||
|         |  #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a | ||||
|         |  special case for pronouns: all pronouns are lemmatized to the special | ||||
|         |  token #[code -PRON-]. | ||||
| 
 | ||||
|     +infobox("About spaCy's custom pronoun lemma") | ||||
|         |  Unlike verbs and common nouns, there's no clear base form of a personal | ||||
|         |  pronoun. Should the lemma of "me" be "I", or should we normalize person | ||||
|         |  as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a | ||||
|         |  novel symbol, #[code -PRON-], which is used as the lemma for | ||||
|         |  all personal pronouns. | ||||
| 
 | ||||
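|     p | ||||
|         |  Lemmas are available as #[code Token.lemma_]. A minimal sketch, | ||||
|         |  assuming an installed English model such as #[code en_core_web_sm] | ||||
|         |  (the exact output depends on the model): | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         import spacy | ||||
|         # assumes the en_core_web_sm model is installed | ||||
|         nlp = spacy.load('en_core_web_sm') | ||||
|         doc = nlp(u"I was reading the papers") | ||||
|         print([token.lemma_ for token in doc]) | ||||
|         # e.g. [u'-PRON-', u'be', u'read', u'the', u'paper'] | ||||
| 
 | ||||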
| +section("dependency-parsing") | ||||
|     +h(2, "dependency-parsing") Syntactic Dependency Parsing | ||||
| 
 | ||||
|     +aside("Tip: Understanding labels") | ||||
|         |  You can also use #[code spacy.explain()] to get the description for the | ||||
|         |  string representation of a label. For example, | ||||
|         |  #[code spacy.explain("prt")] will return "particle". | ||||
| 
 | ||||
|     include _annotation/_dep-labels | ||||
| 
 | ||||
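|     p | ||||
|         |  The dependency label is available as #[code Token.dep_] and the | ||||
|         |  syntactic parent as #[code Token.head]. A minimal sketch, assuming | ||||
|         |  an installed English model such as #[code en_core_web_sm]: | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         import spacy | ||||
|         # assumes the en_core_web_sm model is installed | ||||
|         nlp = spacy.load('en_core_web_sm') | ||||
|         doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") | ||||
|         for token in doc: | ||||
|             # the label and the head token this word is attached to | ||||
|             print(token.text, token.dep_, token.head.text) | ||||
| 
 | ||||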
| +section("named-entities") | ||||
|     +h(2, "named-entities") Named Entity Recognition | ||||
| 
 | ||||
|     +aside("Tip: Understanding entity types") | ||||
|         |  You can also use #[code spacy.explain()] to get the description for the | ||||
|         |  string representation of an entity label. For example, | ||||
|         |  #[code spacy.explain("LANGUAGE")] will return "any named language". | ||||
| 
 | ||||
|     include _annotation/_named-entities | ||||
| 
 | ||||
|     +h(3, "biluo") BILUO Scheme | ||||
| 
 | ||||
|     include _annotation/_biluo | ||||
| 
 | ||||
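|     p | ||||
|         |  A minimal sketch of converting character offsets into BILUO tags, | ||||
|         |  assuming the #[code biluo_tags_from_offsets] helper in | ||||
|         |  #[code spacy.gold]. Only the tokenizer is needed here, so a blank | ||||
|         |  #[code English] class is enough: | ||||
| 
 | ||||
|     +aside-code("Example"). | ||||
|         from spacy.lang.en import English | ||||
|         from spacy.gold import biluo_tags_from_offsets | ||||
| 
 | ||||
|         nlp = English() | ||||
|         doc = nlp(u"I like London and Berlin.") | ||||
|         entities = [(7, 13, 'GPE'), (18, 24, 'GPE')] | ||||
|         tags = biluo_tags_from_offsets(doc, entities) | ||||
|         print(tags)  # ['O', 'O', 'U-GPE', 'O', 'U-GPE', 'O'] | ||||
| 
 | ||||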
| +section("training") | ||||
|     +h(2, "json-input") JSON input format for training | ||||
| 
 | ||||
|     +under-construction | ||||
| 
 | ||||
|     p spaCy takes training data in the following format: | ||||
| 
 | ||||
|     +code("Example structure"). | ||||
|         doc: { | ||||
|             id: string, | ||||
|             paragraphs: [{ | ||||
|                 raw: string, | ||||
|                 sents: [int], | ||||
|                 tokens: [{ | ||||
|                     start: int, | ||||
|                     tag: string, | ||||
|                     head: int, | ||||
|                     dep: string | ||||
|                 }], | ||||
|                 ner: [{ | ||||
|                     start: int, | ||||
|                     end: int, | ||||
|                     label: string | ||||
|                 }], | ||||
|                 brackets: [{ | ||||
|                     start: int, | ||||
|                     end: int, | ||||
|                     label: string | ||||
|                 }] | ||||
|             }] | ||||
|         } | ||||
|  | @ -1,6 +1,6 @@ | |||
| //- 💫 DOCS > API > BINDER | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| include ../_includes/_mixins | ||||
| 
 | ||||
| p A container class for serializing collections of #[code Doc] objects. | ||||
| 
 | ||||
							
								
								
									
5 website/api/dependencyparser.jade Normal file
							|  | @ -0,0 +1,5 @@ | |||
| //- 💫 DOCS > API > DEPENDENCYPARSER | ||||
| 
 | ||||
| include ../_includes/_mixins | ||||
| 
 | ||||
| !=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" }) | ||||
Some files were not shown because too many files have changed in this diff.