From e2ed2f02582da9d19d39ada8a3ea819b9b6f4546 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:51:21 +0200
Subject: [PATCH 1/5] Bump version

---
 website/_harp.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/_harp.json b/website/_harp.json
index 82e82093e..bb0489e64 100644
--- a/website/_harp.json
+++ b/website/_harp.json
@@ -55,7 +55,7 @@
         }
     },

-    "V_CSS": "1.2",
+    "V_CSS": "1.3",
     "V_JS": "1.2",
     "DEFAULT_SYNTAX": "python",
     "ANALYTICS": "UA-58931649-1",
From 7ceaa1614b8eb140c05a4e100165af195604451f Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:51:40 +0200
Subject: [PATCH 2/5] Add experimental model init command

---
 spacy/__main__.py     |  17 +++++-
 spacy/cli/__init__.py |   1 +
 spacy/cli/model.py    | 129 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 spacy/cli/model.py

diff --git a/spacy/__main__.py b/spacy/__main__.py
index 7ec3f535a..a805c984d 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -9,12 +9,13 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
+from spacy.cli import model as cli_model


 class CLI(object):
     """Command-line interface for spaCy"""

-    commands = ('download', 'link', 'info', 'package', 'train')
+    commands = ('download', 'link', 'info', 'package', 'train', 'model')

     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -95,6 +96,20 @@ class CLI(object):
         cli_train(lang, output_dir, train_data, dev_data, n_iter,
                   not no_tagger, not no_parser, not no_ner, parser_L1)

+    @plac.annotations(
+        lang=("model language", "positional", None, str),
+        model_dir=("output directory to store model in", "positional", None, str),
+        freqs_data=("tab-separated frequencies file", "positional", None, str),
+        clusters_data=("Brown clusters file", "positional", None, str),
+        vectors_data=("word vectors file", "positional", None, str)
+    )
+    def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
+        """
+        Initialize a new model and its data directory.
+        """
+
+        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
+
     def __missing__(self, name):
         print("\n    Command %r does not exist."

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index a4bc57ea9..b97279dec 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -3,3 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train, train_config
+from .model import model

diff --git a/spacy/cli/model.py b/spacy/cli/model.py
new file mode 100644
index 000000000..350023d5a
--- /dev/null
+++ b/spacy/cli/model.py
@@ -0,0 +1,129 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import gzip
+import math
+from ast import literal_eval
+from pathlib import Path
+from preshed.counter import PreshCounter
+
+from ..vocab import Vocab, write_binary_vectors
+from .. import util
+
+
+def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
+    model_path = Path(model_dir)
+    freqs_path = Path(freqs_data)
+    clusters_path = Path(clusters_data) if clusters_data else None
+    vectors_path = Path(vectors_data) if vectors_data else None
+
+    check_dirs(freqs_path, clusters_path, vectors_path)
+    vocab = util.get_lang_class(lang).Defaults.create_vocab()
+    probs, oov_prob = read_probs(freqs_path)
+    clusters = read_clusters(clusters_path) if clusters_path else {}
+    populate_vocab(vocab, clusters, probs, oov_prob)
+    create_model(model_path, vectors_path, vocab, oov_prob)
+
+
+def create_model(model_path, vectors_path, vocab, oov_prob):
+    vocab_path = model_path / 'vocab'
+    lexemes_path = vocab_path / 'lexemes.bin'
+    strings_path = vocab_path / 'strings.json'
+    oov_path = vocab_path / 'oov_prob'
+
+    if not model_path.exists():
+        model_path.mkdir()
+    if not vocab_path.exists():
+        vocab_path.mkdir()
+    vocab.dump(lexemes_path.as_posix())
+    with strings_path.open('w') as f:
+        vocab.strings.dump(f)
+    with oov_path.open('w') as f:
+        f.write('%f' % oov_prob)
+    if vectors_path:
+        vectors_dest = model_path / 'vec.bin'
+        write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
+
+
+def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
+    counts = PreshCounter()
+    total = 0
+    freqs_file = check_unzip(freqs_path)
+    for i, line in enumerate(freqs_file):
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        freq = int(freq)
+        counts.inc(i+1, freq)
+        total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    freqs_file = check_unzip(freqs_path)
+    probs = {}
+    for line in freqs_file:
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        doc_freq = int(doc_freq)
+        freq = int(freq)
+        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+            word = literal_eval(key)
+            smooth_count = counts.smoother(int(freq))
+            probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_path):
+    clusters = {}
+    with clusters_path.open() as f:
+        for line in f:
+            try:
+                cluster, word, freq = line.split()
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = '0'
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        if word.lower() not in clusters:
+            clusters[word.lower()] = cluster
+        if word.title() not in clusters:
+            clusters[word.title()] = cluster
+        if word.upper() not in clusters:
+            clusters[word.upper()] = cluster
+    return clusters
+
+
+def populate_vocab(vocab, clusters, probs, oov_prob):
+    # Ensure probs has entries for all words seen during clustering.
+    for word in clusters:
+        if word not in probs:
+            probs[word] = oov_prob
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        if word in clusters:
+            lexeme.cluster = int(clusters[word][::-1], 2)
+        else:
+            lexeme.cluster = 0
+
+
+def check_unzip(file_path):
+    file_path_str = file_path.as_posix()
+    if file_path_str.endswith('gz'):
+        return gzip.open(file_path_str)
+    else:
+        return file_path.open()
+
+
+def check_dirs(freqs_data, clusters_data, vectors_data):
+    if not freqs_data.is_file():
+        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
+    if clusters_data and not clusters_data.is_file():
+        util.sys_exit(clusters_data.as_posix(), title="No Brown clusters file found")
+    if vectors_data and not vectors_data.is_file():
+        util.sys_exit(vectors_data.as_posix(), title="No word vectors file found")
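
Note on PATCH 2/5: the new `model` command wires three data files into a fresh model
directory -- a gzipped or plain tab-separated frequencies file (one
`frequency<TAB>document frequency<TAB>key` row per word), an optional Brown clusters
file and an optional word vectors file. Two details are easy to miss. First,
`read_probs` converts smoothed counts into log-probabilities relative to the corpus
total. Second, `populate_vocab` parses each Brown cluster bit string *reversed*, so
the first bits of the cluster path land in the lowest bits of the integer and can be
masked off cheaply with `& 15`. A minimal, self-contained sketch of both -- the
total, count and cluster string below are invented for illustration, not taken from
the patch:

    import math

    # Log-probability, as in read_probs(): log(smoothed count) - log(total).
    # With an invented corpus total of 1,000,000 and a smoothed count of 50:
    total = 1000000
    smooth_count = 50
    prob = math.log(smooth_count) - math.log(total)
    print(prob)  # -9.903..., i.e. log(50 / 1000000)

    # Cluster encoding, as in populate_vocab(): the bit string is reversed
    # before parsing, so `encoded & 15` recovers the *first* four bits of
    # the cluster path rather than the last four.
    bits = '10110001'              # invented Brown cluster path
    encoded = int(bits[::-1], 2)   # int('10001101', 2) == 141
    print(encoded & 15)            # 13 == 0b1101
    assert encoded & 15 == int(bits[:4][::-1], 2)

With the command registered in `spacy/__main__.py`, an invocation would look like
`python -m spacy model en ./en_model freqs.txt.gz clusters.txt vectors.bin` (file
names illustrative); the clusters and vectors arguments are optional.
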
From 7198cf1c8aa9d3d63772b263ec0cce1cbca15083 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:56:05 +0200
Subject: [PATCH 3/5] Remove unused import

---
 spacy/cli/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/model.py b/spacy/cli/model.py
index 350023d5a..4cfd9a6f6 100644
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@@ -7,7 +7,7 @@ from ast import literal_eval
 from pathlib import Path
 from preshed.counter import PreshCounter

-from ..vocab import Vocab, write_binary_vectors
+from ..vocab import write_binary_vectors
 from .. import util

From d4a59c254bd30a83748586a85850e0cd351b20d9 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 1 Apr 2017 10:19:01 +0200
Subject: [PATCH 4/5] Remove whitespace

---
 spacy/lexeme.pyx      | 30 +++++++++++++++---------------
 spacy/tokens/span.pyx | 26 +++++++++++++-------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 1d5421d74..84338e281 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -87,7 +87,7 @@ cdef class Lexeme:
         value (bool): The new value of the flag.
         """
         Lexeme.c_set_flag(self.c, flag_id, value)
-    
+
     def check_flag(self, attr_id_t flag_id):
         """Check the value of a boolean flag.

@@ -137,7 +137,7 @@ cdef class Lexeme:
                     "\npython -m spacy.%s.download all\n"
                     "to install the data."
                     % self.vocab.lang
                 )
-            
+
             vector_view = self.c.vector
             return numpy.asarray(vector_view)

@@ -163,7 +163,7 @@ cdef class Lexeme:
             return self.c.sentiment
         def __set__(self, float sentiment): self.c.sentiment = sentiment
-    
+
     property orth_:
         def __get__(self):
             return self.vocab.strings[self.c.orth]

@@ -171,7 +171,7 @@ cdef class Lexeme:
     property lower:
         def __get__(self): return self.c.lower
         def __set__(self, int x): self.c.lower = x
-    
+
     property norm:
         def __get__(self): return self.c.norm
         def __set__(self, int x): self.c.norm = x

@@ -187,11 +187,11 @@ cdef class Lexeme:
     property suffix:
         def __get__(self): return self.c.suffix
         def __set__(self, int x): self.c.suffix = x
-    
+
     property cluster:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x
-    
+
     property lang:
         def __get__(self): return self.c.lang
         def __set__(self, int x): self.c.lang = x

@@ -203,11 +203,11 @@ cdef class Lexeme:
     property lower_:
         def __get__(self): return self.vocab.strings[self.c.lower]
         def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
-    
+
     property norm_:
         def __get__(self): return self.vocab.strings[self.c.norm]
         def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
-    
+
     property shape_:
         def __get__(self): return self.vocab.strings[self.c.shape]
         def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]

@@ -239,7 +239,7 @@ cdef class Lexeme:
     property is_alpha:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
-    
+
     property is_ascii:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)

@@ -260,23 +260,23 @@ cdef class Lexeme:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)

-    property is_space: 
+    property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)

-    property is_bracket: 
+    property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)

-    property is_quote: 
+    property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)

-    property is_left_punct: 
+    property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

-    property is_right_punct: 
+    property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

@@ -284,7 +284,7 @@ cdef class Lexeme:
     property like_url:
         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
-    
+
     property like_num:
         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index fc5d26174..37d99183c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -128,13 +128,13 @@ cdef class Span:
             end = token_by_end(self.doc.c, self.doc.length, self.end_char)
             if end == -1:
                 raise IndexError("Error calculating span: Can't find end")
-        
+
         self.start = start
         self.end = end + 1

     property sent:
         '''The sentence span that this span is a part of.
-        
+
         Returns: Span The sentence this is part of.
         '''

@@ -157,7 +157,7 @@ cdef class Span:
             if 'has_vector' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['has_vector'](self)
             return any(token.has_vector for token in self)
-    
+
     property vector:
         def __get__(self):
             if 'vector' in self.doc.user_span_hooks:

@@ -200,9 +200,9 @@ cdef class Span:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
-        has been syntactically parsed. A base noun phrase, or 
-        'NP chunk', is a noun phrase that does not permit other NPs to 
-        be nested within it – so no NP-level coordination, no prepositional 
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):

@@ -227,19 +227,19 @@ cdef class Span:

         Returns:
             Token: The root token.
-        
+
         i.e. has the shortest path to the root of the
         sentence (or is the root itself).

         If multiple words are equally high in the tree, the first word is taken.
-        
+
         For example:
-        
+
         >>> toks = nlp(u'I like New York in Autumn.')

         Let's name the indices --- easier than writing "toks[4]" etc.

-        >>> i, like, new, york, in_, autumn, dot = range(len(toks)) 
+        >>> i, like, new, york, in_, autumn, dot = range(len(toks))

         The head of 'new' is 'York', and the head of 'York' is 'like'

@@ -301,10 +301,10 @@ cdef class Span:
                 return self.doc[self.start]
             else:
                 return self.doc[root]
-    
+
     property lefts:
         """Tokens that are to the left of the span, whose head is within the Span.
-        
+
         Yields: Token A left-child of a token of the span.
         """
         def __get__(self):

@@ -315,7 +315,7 @@ cdef class Span:

     property rights:
         """Tokens that are to the right of the Span, whose head is within the Span.
-        
+
         Yields: Token A right-child of a token of the span.
         """
         def __get__(self):
From 42382d56926ac545b47dfb5e9abcdbb4b7e469f8 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 1 Apr 2017 10:19:32 +0200
Subject: [PATCH 5/5] Fix download commands in error messages (see #946)

---
 spacy/lexeme.pyx       | 2 +-
 spacy/tokens/doc.pyx   | 4 ++--
 spacy/tokens/span.pyx  | 2 +-
 spacy/tokens/token.pyx | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 84338e281..3a26161bb 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -134,7 +134,7 @@ cdef class Lexeme:
                 raise ValueError(
                     "Word vectors set to length 0. This may be because the "
                     "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data."
                     % self.vocab.lang
                 )

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index bda528383..2e1481d1b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -394,7 +394,7 @@ cdef class Doc:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
+                "\npython -m spacy download %s\n"
                 "to install the data" % self.vocab.lang)
         # Accumulate the result before beginning to iterate over it. This prevents
         # the tokenisation from being changed out from under us during the iteration.
@@ -427,7 +427,7 @@ cdef class Doc:
             raise ValueError(
                 "sentence boundary detection requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
+                "\npython -m spacy download %s\n"
                 "to install the data" % self.vocab.lang)
         cdef int i
         start = 0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 37d99183c..f43d47876 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -210,7 +210,7 @@ cdef class Span:
                 raise ValueError(
                     "noun_chunks requires the dependency parse, which "
                     "requires data to be installed. If you haven't done so, run: "
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data" % self.vocab.lang)
             # Accumulate the result before beginning to iterate over it. This prevents
             # the tokenisation from being changed out from under us during the iteration.

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 34de9dee7..b8e470437 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -236,7 +236,7 @@ cdef class Token:
                 raise ValueError(
                     "Word vectors set to length 0. This may be because the "
                     "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data."
                     % self.vocab.lang
                 )
            vector_view = self.c.lex.vector
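
Note on PATCH 5/5: the change only swaps the obsolete `python -m spacy.%s.download all`
hint for the current `python -m spacy download %s` form; the `%s` placeholder is still
filled from `self.vocab.lang`. A quick sketch of the message a user would now see,
with 'en' as an example value for the language code:

    lang = 'en'  # example value for self.vocab.lang
    print("Word vectors set to length 0. This may be because the "
          "data is not installed. If you haven't already, run"
          "\npython -m spacy download %s\n"
          "to install the data." % lang)
    # ...If you haven't already, run
    # python -m spacy download en
    # to install the data.

The missing space after "run" is harmless, since the `\n` in the next string fragment
starts a new line anyway.
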