Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-08-20 14:50:01 +02:00
commit 3fe0d76e6d
9 changed files with 140 additions and 12 deletions

View File

@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a
You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

View File

@ -3,15 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
if __name__ == '__main__':
import plac
import sys
from spacy.cli import download, link, info, package, train, convert
from spacy.cli import download, link, info, package, train, convert, model
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
commands = {
'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model
}
if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
@ -19,5 +25,7 @@ if __name__ == '__main__':
if command in commands:
plac.call(commands[command])
else:
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)
prints(
"Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)

View File

@ -4,3 +4,4 @@ from .link import link
from .package import package
from .train import train
from .convert import convert
from .model import model

119
spacy/cli/model.py Normal file
View File

@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from pathlib import Path
from preshed.counter import PreshCounter
import spacy
from ..compat import fix_text
from .. import util
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
model_path = Path(model_dir)
freqs_path = Path(freqs_data)
clusters_path = Path(clusters_data) if clusters_data else None
vectors_path = Path(vectors_data) if vectors_data else None
check_dirs(freqs_path, clusters_path, vectors_path)
# vocab = util.get_lang_class(lang).Defaults.create_vocab()
nlp = spacy.blank(lang)
vocab = nlp.vocab
probs, oov_prob = read_probs(freqs_path)
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
create_model(model_path, nlp)
def create_model(model_path, model):
if not model_path.exists():
model_path.mkdir()
model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i + 1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(
key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(
sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = file_path.as_posix()
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
if not freqs_data.is_file():
util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
if clusters_data and not clusters_data.is_file():
util.sys_exit(
clusters_data.as_posix(), title="No Brown clusters file found")
if vectors_data and not vectors_data.is_file():
util.sys_exit(
vectors_data.as_posix(), title="No word vectors file found")

View File

@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
p
| The token within the span that's highest in the parse tree. If there's a
| tie, the earlist is prefered.
| tie, the earliest is preferred.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')

View File

@ -39,7 +39,7 @@ p
+h(2, "special-cases") Adding special case tokenization rules
p
| Most domains have at least some idiosyncracies that require custom
| Most domains have at least some idiosyncrasies that require custom
| tokenization rules. This could be very certain expressions, or
| abbreviations only used in this specific field.

View File

@ -109,7 +109,7 @@ p
| The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need to
| make sure that you have a development enviroment consisting of a Python
| make sure that you have a development environment consisting of a Python
| distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and

View File

@ -40,7 +40,7 @@ p
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
+row
+cell I read the paper yesteday
+cell I read the paper yesterday
+cell read
+cell read
+cell verb

View File

@ -94,7 +94,7 @@ p
| is mostly intended as a convenient, interactive wrapper. It performs
| compatibility checks and prints detailed error messages and warnings.
| However, if you're downloading models as part of an automated build
| process, this only adds an unecessary layer of complexity. If you know
| process, this only adds an unnecessary layer of complexity. If you know
| which models your application needs, you should be specifying them directly.
p