From e2ed2f02582da9d19d39ada8a3ea819b9b6f4546 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:51:21 +0200
Subject: [PATCH 1/5] Bump version

---
 website/_harp.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/_harp.json b/website/_harp.json
index 82e82093e..bb0489e64 100644
--- a/website/_harp.json
+++ b/website/_harp.json
@@ -55,7 +55,7 @@
         }
     },

-    "V_CSS": "1.2",
+    "V_CSS": "1.3",
     "V_JS": "1.2",
     "DEFAULT_SYNTAX": "python",
     "ANALYTICS": "UA-58931649-1",
From 7ceaa1614b8eb140c05a4e100165af195604451f Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:51:40 +0200
Subject: [PATCH 2/5] Add experimental model init command

---
 spacy/__main__.py     |  17 +++++-
 spacy/cli/__init__.py |   1 +
 spacy/cli/model.py    | 129 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 spacy/cli/model.py

diff --git a/spacy/__main__.py b/spacy/__main__.py
index 7ec3f535a..a805c984d 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -9,12 +9,13 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
+from spacy.cli import model as cli_model


 class CLI(object):
     """Command-line interface for spaCy"""

-    commands = ('download', 'link', 'info', 'package', 'train')
+    commands = ('download', 'link', 'info', 'package', 'train', 'model')

     @plac.annotations(
         model=("model to download (shortcut or model name)", "positional", None, str),
@@ -95,6 +96,20 @@ class CLI(object):
         cli_train(lang, output_dir, train_data, dev_data, n_iter,
                   not no_tagger, not no_parser, not no_ner, parser_L1)

+    @plac.annotations(
+        lang=("model language", "positional", None, str),
+        model_dir=("output directory to store model in", "positional", None, str),
+        freqs_data=("tab-separated frequencies file", "positional", None, str),
+        clusters_data=("Brown clusters file", "positional", None, str),
+        vectors_data=("word vectors file", "positional", None, str)
+    )
+    def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None):
+        """
+        Initialize a new model and its data directory.
+        """
+
+        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
+
     def __missing__(self, name):
         print("\n    Command %r does not exist."

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index a4bc57ea9..b97279dec 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -3,3 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train, train_config
+from .model import model

diff --git a/spacy/cli/model.py b/spacy/cli/model.py
new file mode 100644
index 000000000..350023d5a
--- /dev/null
+++ b/spacy/cli/model.py
@@ -0,0 +1,129 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import gzip
+import math
+from ast import literal_eval
+from pathlib import Path
+from preshed.counter import PreshCounter
+
+from ..vocab import Vocab, write_binary_vectors
+from .. import util
+
+
+def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
+    model_path = Path(model_dir)
+    freqs_path = Path(freqs_data)
+    clusters_path = Path(clusters_data) if clusters_data else None
+    vectors_path = Path(vectors_data) if vectors_data else None
+
+    check_dirs(freqs_path, clusters_path, vectors_path)
+    vocab = util.get_lang_class(lang).Defaults.create_vocab()
+    probs, oov_prob = read_probs(freqs_path)
+    clusters = read_clusters(clusters_path) if clusters_path else {}
+    populate_vocab(vocab, clusters, probs, oov_prob)
+    create_model(model_path, vectors_path, vocab, oov_prob)
+
+
+def create_model(model_path, vectors_path, vocab, oov_prob):
+    vocab_path = model_path / 'vocab'
+    lexemes_path = vocab_path / 'lexemes.bin'
+    strings_path = vocab_path / 'strings.json'
+    oov_path = vocab_path / 'oov_prob'
+
+    if not model_path.exists():
+        model_path.mkdir()
+    if not vocab_path.exists():
+        vocab_path.mkdir()
+    vocab.dump(lexemes_path.as_posix())
+    with strings_path.open('w') as f:
+        vocab.strings.dump(f)
+    with oov_path.open('w') as f:
+        f.write('%f' % oov_prob)
+    if vectors_path:
+        vectors_dest = model_path / 'vec.bin'
+        write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix())
+
+
+def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
+    counts = PreshCounter()
+    total = 0
+    freqs_file = check_unzip(freqs_path)
+    for i, line in enumerate(freqs_file):
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        freq = int(freq)
+        counts.inc(i+1, freq)
+        total += freq
+    counts.smooth()
+    log_total = math.log(total)
+    freqs_file = check_unzip(freqs_path)
+    probs = {}
+    for line in freqs_file:
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
+        doc_freq = int(doc_freq)
+        freq = int(freq)
+        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
+            word = literal_eval(key)
+            smooth_count = counts.smoother(int(freq))
+            probs[word] = math.log(smooth_count) - log_total
+    oov_prob = math.log(counts.smoother(0)) - log_total
+    return probs, oov_prob
+
+
+def read_clusters(clusters_path):
+    clusters = {}
+    with clusters_path.open() as f:
+        for line in f:
+            try:
+                cluster, word, freq = line.split()
+            except ValueError:
+                continue
+            # If the clusterer has only seen the word a few times, its
+            # cluster is unreliable.
+            if int(freq) >= 3:
+                clusters[word] = cluster
+            else:
+                clusters[word] = '0'
+    # Expand clusters with re-casing
+    for word, cluster in list(clusters.items()):
+        if word.lower() not in clusters:
+            clusters[word.lower()] = cluster
+        if word.title() not in clusters:
+            clusters[word.title()] = cluster
+        if word.upper() not in clusters:
+            clusters[word.upper()] = cluster
+    return clusters
+
+
+def populate_vocab(vocab, clusters, probs, oov_prob):
+    # Ensure probs has entries for all words seen during clustering.
+    for word in clusters:
+        if word not in probs:
+            probs[word] = oov_prob
+    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+        lexeme = vocab[word]
+        lexeme.prob = prob
+        lexeme.is_oov = False
+        # Decode as a little-endian string, so that we can do & 15 to get
+        # the first 4 bits. See _parse_features.pyx
+        if word in clusters:
+            lexeme.cluster = int(clusters[word][::-1], 2)
+        else:
+            lexeme.cluster = 0
+
+
+def check_unzip(file_path):
+    file_path_str = file_path.as_posix()
+    if file_path_str.endswith('gz'):
+        return gzip.open(file_path_str)
+    else:
+        return file_path.open()
+
+
+def check_dirs(freqs_data, clusters_data, vectors_data):
+    if not freqs_data.is_file():
+        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
+    if clusters_data and not clusters_data.is_file():
+        util.sys_exit(clusters_data.as_posix(), title="No Brown clusters file found")
+    if vectors_data and not vectors_data.is_file():
+        util.sys_exit(vectors_data.as_posix(), title="No word vectors file found")
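
Note on PATCH 2/5: the new `model` command wires three data files into a fresh model
directory -- a gzipped or plain tab-separated frequencies file (one
`frequency<TAB>document frequency<TAB>key` row per word), an optional Brown clusters
file and an optional word vectors file. Two details are easy to miss. First,
`read_probs` converts smoothed counts into log-probabilities relative to the corpus
total. Second, `populate_vocab` parses each Brown cluster bit string *reversed*, so
the first bits of the cluster path land in the lowest bits of the integer and can be
masked off cheaply with `& 15`. A minimal, self-contained sketch of both -- the
total, count and cluster string below are invented for illustration, not taken from
the patch:

    import math

    # Log-probability, as in read_probs(): log(smoothed count) - log(total).
    # With an invented corpus total of 1,000,000 and a smoothed count of 50:
    total = 1000000
    smooth_count = 50
    prob = math.log(smooth_count) - math.log(total)
    print(prob)  # -9.903..., i.e. log(50 / 1000000)

    # Cluster encoding, as in populate_vocab(): the bit string is reversed
    # before parsing, so `encoded & 15` recovers the *first* four bits of
    # the cluster path rather than the last four.
    bits = '10110001'              # invented Brown cluster path
    encoded = int(bits[::-1], 2)   # int('10001101', 2) == 141
    print(encoded & 15)            # 13 == 0b1101
    assert encoded & 15 == int(bits[:4][::-1], 2)

With the command registered in `spacy/__main__.py`, an invocation would look like
`python -m spacy model en ./en_model freqs.txt.gz clusters.txt vectors.bin` (file
names illustrative); the clusters and vectors arguments are optional.
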
From 7198cf1c8aa9d3d63772b263ec0cce1cbca15083 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 26 Mar 2017 20:56:05 +0200
Subject: [PATCH 3/5] Remove unused import

---
 spacy/cli/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/model.py b/spacy/cli/model.py
index 350023d5a..4cfd9a6f6 100644
--- a/spacy/cli/model.py
+++ b/spacy/cli/model.py
@@ -7,7 +7,7 @@ from ast import literal_eval
 from pathlib import Path
 from preshed.counter import PreshCounter

-from ..vocab import Vocab, write_binary_vectors
+from ..vocab import write_binary_vectors
 from .. import util

From d4a59c254bd30a83748586a85850e0cd351b20d9 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 1 Apr 2017 10:19:01 +0200
Subject: [PATCH 4/5] Remove whitespace

---
 spacy/lexeme.pyx      | 30 +++++++++++++++---------------
 spacy/tokens/span.pyx | 26 +++++++++++++-------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 1d5421d74..84338e281 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -87,7 +87,7 @@ cdef class Lexeme:
         value (bool): The new value of the flag.
         """
         Lexeme.c_set_flag(self.c, flag_id, value)
-    
+
     def check_flag(self, attr_id_t flag_id):
         """Check the value of a boolean flag.

@@ -137,7 +137,7 @@ cdef class Lexeme:
                     "\npython -m spacy.%s.download all\n"
                     "to install the data."
                     % self.vocab.lang
                 )
-            
+
             vector_view = self.c.vector
             return numpy.asarray(vector_view)

@@ -163,7 +163,7 @@ cdef class Lexeme:
             return self.c.sentiment
         def __set__(self, float sentiment): self.c.sentiment = sentiment
-    
+
     property orth_:
         def __get__(self):
             return self.vocab.strings[self.c.orth]

@@ -171,7 +171,7 @@ cdef class Lexeme:
     property lower:
         def __get__(self): return self.c.lower
         def __set__(self, int x): self.c.lower = x
-    
+
     property norm:
         def __get__(self): return self.c.norm
         def __set__(self, int x): self.c.norm = x

@@ -187,11 +187,11 @@ cdef class Lexeme:
     property suffix:
         def __get__(self): return self.c.suffix
         def __set__(self, int x): self.c.suffix = x
-    
+
     property cluster:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x
-    
+
     property lang:
         def __get__(self): return self.c.lang
         def __set__(self, int x): self.c.lang = x

@@ -203,11 +203,11 @@ cdef class Lexeme:
     property lower_:
         def __get__(self): return self.vocab.strings[self.c.lower]
         def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]
-    
+
     property norm_:
         def __get__(self): return self.vocab.strings[self.c.norm]
         def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]
-    
+
     property shape_:
         def __get__(self): return self.vocab.strings[self.c.shape]
         def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]

@@ -239,7 +239,7 @@ cdef class Lexeme:
     property is_alpha:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x)
-    
+
     property is_ascii:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x)

@@ -260,23 +260,23 @@ cdef class Lexeme:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x)

-    property is_space: 
+    property is_space:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)

-    property is_bracket: 
+    property is_bracket:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)

-    property is_quote: 
+    property is_quote:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)

-    property is_left_punct: 
+    property is_left_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

-    property is_right_punct: 
+    property is_right_punct:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)

@@ -284,7 +284,7 @@ cdef class Lexeme:
     property like_url:
         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
-    
+
     property like_num:
         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index fc5d26174..37d99183c 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -128,13 +128,13 @@ cdef class Span:
             end = token_by_end(self.doc.c, self.doc.length, self.end_char)
             if end == -1:
                 raise IndexError("Error calculating span: Can't find end")
-        
+
         self.start = start
         self.end = end + 1

     property sent:
         '''The sentence span that this span is a part of.
-        
+
         Returns: Span The sentence this is part of.
         '''

@@ -157,7 +157,7 @@ cdef class Span:
             if 'has_vector' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['has_vector'](self)
             return any(token.has_vector for token in self)
-    
+
     property vector:
         def __get__(self):
             if 'vector' in self.doc.user_span_hooks:

@@ -200,9 +200,9 @@ cdef class Span:
     property noun_chunks:
         '''
         Yields base noun-phrase #[code Span] objects, if the document
-        has been syntactically parsed. A base noun phrase, or 
-        'NP chunk', is a noun phrase that does not permit other NPs to 
-        be nested within it – so no NP-level coordination, no prepositional 
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it – so no NP-level coordination, no prepositional
         phrases, and no relative clauses. For example:
         '''
         def __get__(self):

@@ -227,19 +227,19 @@ cdef class Span:

         Returns:
             Token: The root token.
-        
+
         i.e. has the shortest path to the root of the
         sentence (or is the root itself).

         If multiple words are equally high in the tree, the first word is taken.
-        
+
         For example:
-        
+
         >>> toks = nlp(u'I like New York in Autumn.')

         Let's name the indices --- easier than writing "toks[4]" etc.

-        >>> i, like, new, york, in_, autumn, dot = range(len(toks)) 
+        >>> i, like, new, york, in_, autumn, dot = range(len(toks))

         The head of 'new' is 'York', and the head of 'York' is 'like'

@@ -301,10 +301,10 @@ cdef class Span:
                 return self.doc[self.start]
             else:
                 return self.doc[root]
-    
+
     property lefts:
         """Tokens that are to the left of the span, whose head is within the Span.
-        
+
         Yields: Token A left-child of a token of the span.
         """
         def __get__(self):

@@ -315,7 +315,7 @@ cdef class Span:

     property rights:
         """Tokens that are to the right of the Span, whose head is within the Span.
-        
+
         Yields: Token A right-child of a token of the span.
         """
         def __get__(self):
From 42382d56926ac545b47dfb5e9abcdbb4b7e469f8 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 1 Apr 2017 10:19:32 +0200
Subject: [PATCH 5/5] Fix download commands in error messages (see #946)

---
 spacy/lexeme.pyx       | 2 +-
 spacy/tokens/doc.pyx   | 4 ++--
 spacy/tokens/span.pyx  | 2 +-
 spacy/tokens/token.pyx | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 84338e281..3a26161bb 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -134,7 +134,7 @@ cdef class Lexeme:
                 raise ValueError(
                     "Word vectors set to length 0. This may be because the "
                     "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data."
                     % self.vocab.lang
                 )

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index bda528383..2e1481d1b 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -394,7 +394,7 @@ cdef class Doc:
             raise ValueError(
                 "noun_chunks requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
+                "\npython -m spacy download %s\n"
                 "to install the data" % self.vocab.lang)
         # Accumulate the result before beginning to iterate over it. This prevents
         # the tokenisation from being changed out from under us during the iteration.
@@ -427,7 +427,7 @@ cdef class Doc:
             raise ValueError(
                 "sentence boundary detection requires the dependency parse, which "
                 "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
+                "\npython -m spacy download %s\n"
                 "to install the data" % self.vocab.lang)
         cdef int i
         start = 0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 37d99183c..f43d47876 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -210,7 +210,7 @@ cdef class Span:
                 raise ValueError(
                     "noun_chunks requires the dependency parse, which "
                     "requires data to be installed. If you haven't done so, run: "
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data" % self.vocab.lang)
             # Accumulate the result before beginning to iterate over it. This prevents
             # the tokenisation from being changed out from under us during the iteration.

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 34de9dee7..b8e470437 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -236,7 +236,7 @@ cdef class Token:
                 raise ValueError(
                     "Word vectors set to length 0. This may be because the "
                     "data is not installed. If you haven't already, run"
-                    "\npython -m spacy.%s.download all\n"
+                    "\npython -m spacy download %s\n"
                     "to install the data."
                     % self.vocab.lang
                 )
            vector_view = self.c.lex.vector
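
Note on PATCH 5/5: the change only swaps the obsolete `python -m spacy.%s.download all`
hint for the current `python -m spacy download %s` form; the `%s` placeholder is still
filled from `self.vocab.lang`. A quick sketch of the message a user would now see,
with 'en' as an example value for the language code:

    lang = 'en'  # example value for self.vocab.lang
    print("Word vectors set to length 0. This may be because the "
          "data is not installed. If you haven't already, run"
          "\npython -m spacy download %s\n"
          "to install the data." % lang)
    # ...If you haven't already, run
    # python -m spacy download en
    # to install the data.

The missing space after "run" is harmless, since the `\n` in the next string fragment
starts a new line anyway.
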