diff --git a/README.rst b/README.rst index e5e2dcc77..244308473 100644 --- a/README.rst +++ b/README.rst @@ -229,7 +229,7 @@ Compile from source The other way to install spaCy is to clone its `GitHub repository `_ and build it from source. That is the common way if you want to make changes to the code base. -You'll need to make sure that you have a development enviroment consisting of a +You'll need to make sure that you have a development environment consisting of a Python distribution including header files, a compiler, `pip `__, `virtualenv `_ and `git `_ installed. The compiler part is the trickiest. diff --git a/spacy/__main__.py b/spacy/__main__.py index 214a7b617..2b15e4374 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -3,15 +3,21 @@ from __future__ import print_function # NB! This breaks in plac on Python 2!! #from __future__ import unicode_literals - if __name__ == '__main__': import plac import sys - from spacy.cli import download, link, info, package, train, convert + from spacy.cli import download, link, info, package, train, convert, model from spacy.util import prints - commands = {'download': download, 'link': link, 'info': info, 'train': train, - 'convert': convert, 'package': package} + commands = { + 'download': download, + 'link': link, + 'info': info, + 'train': train, + 'convert': convert, + 'package': package, + 'model': model + } if len(sys.argv) == 1: prints(', '.join(commands), title="Available commands", exits=1) command = sys.argv.pop(1) @@ -19,5 +25,7 @@ if __name__ == '__main__': if command in commands: plac.call(commands[command]) else: - prints("Available: %s" % ', '.join(commands), - title="Unknown command: %s" % command, exits=1) + prints( + "Available: %s" % ', '.join(commands), + title="Unknown command: %s" % command, + exits=1) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2b4f98a88..480b27a23 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,3 +4,4 @@ from .link import link from 
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Build a blank model whose vocab is populated from frequency data.

    cmd: Not used by this function.
    lang (unicode): Language code passed to `spacy.blank`.
    model_dir (unicode): Directory the model is written to.
    freqs_data (unicode): Path to the tab-separated frequencies file
        (optionally gzipped), see `read_probs`.
    clusters_data (unicode): Optional path to a Brown clusters file,
        see `read_clusters`.
    vectors_data (unicode): Optional path to a word vectors file.
    """
    model_path = Path(model_dir)
    freqs_path = Path(freqs_data)
    clusters_path = Path(clusters_data) if clusters_data else None
    vectors_path = Path(vectors_data) if vectors_data else None

    # NOTE: the vectors file is only validated here; nothing in this
    # function loads it into the model.
    check_dirs(freqs_path, clusters_path, vectors_path)
    nlp = spacy.blank(lang)
    vocab = nlp.vocab
    probs, oov_prob = read_probs(freqs_path)
    clusters = read_clusters(clusters_path) if clusters_path else {}
    populate_vocab(vocab, clusters, probs, oov_prob)
    create_model(model_path, nlp)


def create_model(model_path, model):
    """Serialize `model` to `model_path`, creating the directory if needed.

    model_path (Path): Output directory.
    model: Object exposing `to_disk(path)`.
    """
    if not model_path.exists():
        # parents=True so a nested output path such as "out/en/model" works
        # even when the intermediate directories do not exist yet.
        model_path.mkdir(parents=True)
    model.to_disk(model_path.as_posix())


def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read word frequencies and convert them to smoothed log probabilities.

    Each line of the file is `freq<TAB>doc_freq<TAB>key`, where `key` is a
    Python literal (it is parsed with `ast.literal_eval`).

    freqs_path (Path): Frequencies file, plain text or `.gz`.
    max_length (int): Entries whose raw `key` field is this long or longer
        are dropped.
    min_doc_freq (int): Minimum document frequency for an entry to be kept.
    min_freq (int): Minimum frequency for an entry to be kept.
    RETURNS (tuple): `(probs, oov_prob)`; `probs` maps word -> log
        probability, `oov_prob` is the log probability of a zero count.
    RAISES (ValueError): If the file contains no counts.
    """
    counts = PreshCounter()
    total = 0
    # First pass: accumulate every count (including entries filtered out
    # below) so that the smoothing and the total reflect the whole corpus.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    if total <= 0:
        # math.log(0) would raise an opaque "math domain error".
        raise ValueError(
            "No frequency counts found in %s" % freqs_path.as_posix())
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: the file is re-read rather than held in memory.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob


def read_clusters(clusters_path):
    """Read a clusters file into a word -> cluster-string mapping.

    Each line holds three whitespace-separated fields: `cluster word freq`.
    Lines that do not split into exactly three fields are skipped.

    clusters_path (Path): Path to the clusters file.
    RETURNS (dict): Maps each word, plus its lower/title/upper-cased
        variants when not already present, to its cluster string.
    """
    clusters = {}
    with clusters_path.open() as f:
        for line in f:
            try:
                cluster, word, freq = line.split()
                word = fix_text(word)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable.
            if int(freq) >= 3:
                clusters[word] = cluster
            else:
                clusters[word] = '0'
    # Expand clusters with re-casing. Iterate over a snapshot because the
    # dict grows inside the loop; existing entries are never overwritten.
    for word, cluster in list(clusters.items()):
        for variant in (word.lower(), word.title(), word.upper()):
            clusters.setdefault(variant, cluster)
    return clusters


def populate_vocab(vocab, clusters, probs, oov_prob):
    """Create a lexeme for every word in `probs` and set its attributes.

    Words are added in order of descending probability.

    vocab: Mapping-like object; `vocab[word]` must return the lexeme.
    clusters (dict): word -> cluster bit-string, see `read_clusters`.
    probs (dict): word -> log probability, see `read_probs`.
    oov_prob (float): Not used by this function.
    """
    for word, prob in reversed(
            sorted(probs.items(), key=lambda item: item[1])):
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            lexeme.cluster = 0


def check_unzip(file_path):
    """Open `file_path` for reading text, decompressing `.gz` files.

    Both branches return a UTF-8 text stream, so callers can split lines
    on unicode separators regardless of compression.

    file_path (Path): File to open.
    RETURNS: An open text file object; the caller must close it (it can be
        used as a context manager).
    """
    file_path_str = file_path.as_posix()
    if file_path_str.endswith('.gz'):
        import io
        # gzip.open() yields bytes. Wrap it so lines are decoded text. The
        # BufferedReader layer supplies read1() for TextIOWrapper on
        # Python 2.7, where GzipFile itself lacks it.
        raw = io.BufferedReader(gzip.open(file_path_str, 'rb'))
        return io.TextIOWrapper(raw, encoding='utf8')
    return file_path.open(encoding='utf8')


def check_dirs(freqs_data, clusters_data, vectors_data):
    """Exit with an error message if a required input file is missing.

    freqs_data (Path): Required frequencies file.
    clusters_data (Path): Optional clusters file; checked only if given.
    vectors_data (Path): Optional vectors file; checked only if given.
    """
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    if clusters_data and not clusters_data.is_file():
        util.sys_exit(
            clusters_data.as_posix(), title="No Brown clusters file found")
    if vectors_data and not vectors_data.is_file():
        util.sys_exit(
            vectors_data.as_posix(), title="No word vectors file found")
@@ -298,6 +299,10 @@ class Language(object): "Got: %d, %d" % (len(docs), len(golds))) if len(docs) == 0: return + if sgd is None: + if self._optimizer is None: + self._optimizer = Adam(Model.ops, 0.001) + sgd = self._optimizer tok2vec = self.pipeline[0] feats = tok2vec.doc2feats(docs) grads = {} @@ -312,12 +317,13 @@ class Language(object): continue d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - if update_tensors and d_tokvecses is not None: + if update_shared and d_tokvecses is not None: for i, d_tv in enumerate(d_tokvecses): all_d_tokvecses[i] += d_tv - bp_tokvecses(all_d_tokvecses, sgd=sgd) - for key, (W, dW) in grads.items(): - sgd(W, dW, key=key) + if update_shared and bp_tokvecses is not None: + bp_tokvecses(all_d_tokvecses, sgd=sgd) + for key, (W, dW) in grads.items(): + sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. # If we don't do this, the memory leak gets pretty # bad, because we may be holding part of a batch. @@ -378,11 +384,11 @@ class Language(object): eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) - optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, - beta2=beta2, eps=eps) - optimizer.max_grad_norm = max_grad_norm - optimizer.device = device - return optimizer + self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) + self._optimizer.max_grad_norm = max_grad_norm + self._optimizer.device = device + return self._optimizer def evaluate(self, docs_golds): scorer = Scorer() diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 634d3e4b5..8c3759778 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -294,6 +294,8 @@ class NeuralTagger(BaseThincComponent): doc.is_tagged = True def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. 
docs, tokvecs = docs_tokvecs if self.model.nI is None: @@ -302,6 +304,8 @@ class NeuralTagger(BaseThincComponent): loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) + if losses is not None: + losses[self.name] += loss return d_tokvecs def get_loss(self, docs, golds, scores): diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index d15de0181..2f5cd4e48 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -113,7 +113,7 @@ cdef class BiluoPushDown(TransitionSystem): def has_gold(self, GoldParse gold, start=0, end=None): end = end or len(gold.ner) - if all([tag == '-' for tag in gold.ner[start:end]]): + if all([tag in ('-', None) for tag in gold.ner[start:end]]): return False else: return True diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 7412ebeee..f1a0bc91c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -483,6 +483,9 @@ cdef class Parser: return beams def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds) + if not golds: + return None if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: return self.update_beam(docs_tokvecs, golds, self.cfg['beam_width'], self.cfg['beam_density'], @@ -555,6 +558,9 @@ cdef class Parser: def update_beam(self, docs_tokvecs, golds, width=None, density=None, drop=0., sgd=None, losses=None): + docs_tokvecs, golds = self._filter_unlabelled(docs_tokvecs, golds) + if not golds: + return None if width is None: width = self.cfg.get('beam_width', 2) if density is None: @@ -605,6 +611,15 @@ cdef class Parser: bp_my_tokvecs(d_tokvecs, sgd=sgd) return d_tokvecs + def _filter_unlabelled(self, docs_tokvecs, golds): + '''Remove inputs that have no relevant labels before update''' + has_golds = [self.moves.has_gold(gold) for gold in golds] + docs, tokvecs = docs_tokvecs + docs = [docs[i] for i, has_gold in 
enumerate(has_golds) if has_gold] + tokvecs = [tokvecs[i] for i, has_gold in enumerate(has_golds) if has_gold] + golds = [golds[i] for i, has_gold in enumerate(has_golds) if has_gold] + return (docs, tokvecs), golds + def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 2ca2d3ea9..72821ab04 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token. p | The token within the span that's highest in the parse tree. If there's a - | tie, the earlist is prefered. + | tie, the earliest is preferred. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index f56ce9fb1..7e0b4b479 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -39,7 +39,7 @@ p +h(2, "special-cases") Adding special case tokenization rules p - | Most domains have at least some idiosyncracies that require custom + | Most domains have at least some idiosyncrasies that require custom | tokenization rules. This could be very certain expressions, or | abbreviations only used in this specific field. diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 60bc3cd7b..a0aa1dca8 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -109,7 +109,7 @@ p | The other way to install spaCy is to clone its | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | the common way if you want to make changes to the code base. 
You'll need to - | make sure that you have a development enviroment consisting of a Python + | make sure that you have a development environment consisting of a Python | distribution including header files, a compiler, | #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://virtualenv.pypa.io/") virtualenv] and diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index dd72efeba..effc185e9 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -40,7 +40,7 @@ p +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +row - +cell I read the paper yesteday + +cell I read the paper yesterday +cell read +cell read +cell verb diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade index 70227e648..d4a1ffbc2 100644 --- a/website/docs/usage/production-use.jade +++ b/website/docs/usage/production-use.jade @@ -94,7 +94,7 @@ p | is mostly intended as a convenient, interactive wrapper. It performs | compatibility checks and prints detailed error messages and warnings. | However, if you're downloading models as part of an automated build - | process, this only adds an unecessary layer of complexity. If you know + | process, this only adds an unnecessary layer of complexity. If you know | which models your application needs, you should be specifying them directly. p