Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-08-20 14:50:01 +02:00
commit 3fe0d76e6d
9 changed files with 140 additions and 12 deletions

View File

@ -229,7 +229,7 @@ Compile from source
The other way to install spaCy is to clone its The other way to install spaCy is to clone its
`GitHub repository <https://github.com/explosion/spaCy>`_ and build it from `GitHub repository <https://github.com/explosion/spaCy>`_ and build it from
source. That is the common way if you want to make changes to the code base. source. That is the common way if you want to make changes to the code base.
You'll need to make sure that you have a development enviroment consisting of a You'll need to make sure that you have a development environment consisting of a
Python distribution including header files, a compiler, Python distribution including header files, a compiler,
`pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_ `pip <https://pip.pypa.io/en/latest/installing/>`__, `virtualenv <https://virtualenv.pypa.io/>`_
and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest. and `git <https://git-scm.com>`_ installed. The compiler part is the trickiest.

View File

@ -3,15 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!! # NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals #from __future__ import unicode_literals
if __name__ == '__main__': if __name__ == '__main__':
import plac import plac
import sys import sys
from spacy.cli import download, link, info, package, train, convert from spacy.cli import download, link, info, package, train, convert, model
from spacy.util import prints from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train, commands = {
'convert': convert, 'package': package} 'download': download,
'link': link,
'info': info,
'train': train,
'convert': convert,
'package': package,
'model': model
}
if len(sys.argv) == 1: if len(sys.argv) == 1:
prints(', '.join(commands), title="Available commands", exits=1) prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1) command = sys.argv.pop(1)
@ -19,5 +25,7 @@ if __name__ == '__main__':
if command in commands: if command in commands:
plac.call(commands[command]) plac.call(commands[command])
else: else:
prints("Available: %s" % ', '.join(commands), prints(
title="Unknown command: %s" % command, exits=1) "Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command,
exits=1)

View File

@ -4,3 +4,4 @@ from .link import link
from .package import package from .package import package
from .train import train from .train import train
from .convert import convert from .convert import convert
from .model import model

119
spacy/cli/model.py Normal file
View File

@ -0,0 +1,119 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from pathlib import Path
from preshed.counter import PreshCounter
import spacy
from ..compat import fix_text
from .. import util
def model(cmd, lang, model_dir, freqs_data, clusters_data, vectors_data):
    """Build a blank pipeline with a populated vocab and save it to disk.

    cmd: Not used by this function.
    lang: Language identifier passed to ``spacy.blank``.
    model_dir: Output directory for the serialized pipeline.
    freqs_data: Path to the word frequencies file (required).
    clusters_data: Path to a Brown clusters file; falsy to skip clusters.
    vectors_data: Path to a word vectors file; falsy to skip. The path is
        only checked for existence here, the vectors are not loaded.
    """
    output_dir = Path(model_dir)
    freqs_file = Path(freqs_data)
    clusters_file = Path(clusters_data) if clusters_data else None
    vectors_file = Path(vectors_data) if vectors_data else None
    # Bail out early if any of the supplied input files is missing.
    check_dirs(freqs_file, clusters_file, vectors_file)

    nlp = spacy.blank(lang)
    probs, oov_prob = read_probs(freqs_file)
    if clusters_file:
        clusters = read_clusters(clusters_file)
    else:
        clusters = {}
    populate_vocab(nlp.vocab, clusters, probs, oov_prob)
    create_model(output_dir, nlp)
def create_model(model_path, model):
    """Serialize ``model`` into the directory ``model_path``.

    model_path: ``pathlib.Path`` of the output directory. It is created,
        together with any missing parent directories, if it does not exist.
    model: Object exposing ``to_disk(path)``; it receives the POSIX-style
        string form of ``model_path``.
    """
    if not model_path.exists():
        # parents=True so that nested output locations such as
        # "models/en/v1" work even when "models/en" is absent.
        model_path.mkdir(parents=True)
    model.to_disk(model_path.as_posix())
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read a frequencies file and return smoothed log probabilities.

    Each line of the file holds three tab-separated fields: ``freq``,
    ``doc_freq`` and ``key``, where ``key`` is a Python literal that is
    parsed with ``ast.literal_eval`` to obtain the word. The file is read
    twice: once to collect the counts for smoothing, and once to compute the
    probability of every word that passes the filters.

    freqs_path: ``pathlib.Path`` of the frequencies file; opened through
        ``check_unzip``.
    max_length: Entries whose raw ``key`` field is this long or longer are
        dropped.
    min_doc_freq: Minimum document frequency for a word to be kept.
    min_freq: Minimum frequency for a word to be kept.
    RETURNS: ``(probs, oov_prob)`` where ``probs`` maps each kept word to
        ``log(smoothed_count) - log(total)`` and ``oov_prob`` is the same
        quantity for a count of zero.
    """
    counts = PreshCounter()
    total = 0
    # First pass: accumulate every frequency so the smoother sees all of the
    # data. The handle is closed as soon as the pass is finished.
    with check_unzip(freqs_path) as freqs_file:
        for i, line in enumerate(freqs_file):
            freq, _doc_freq, _key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Second pass: keep only sufficiently frequent, sufficiently short entries.
    with check_unzip(freqs_path) as freqs_file:
        for line in freqs_file:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_clusters(clusters_path):
    """Load a word -> cluster mapping from a clusters file.

    Every line is split on whitespace into ``cluster``, ``word`` and
    ``freq``; the word is passed through ``fix_text``. Lines for which the
    split or ``fix_text`` raises ``ValueError`` are skipped.

    clusters_path: ``pathlib.Path`` of the clusters file.
    RETURNS: dict mapping words (plus their lower-, title- and upper-cased
        variants, where not already present) to cluster strings.
    """
    table = {}
    with clusters_path.open() as handle:
        for row in handle:
            try:
                bits, token, count = row.split()
                token = fix_text(token)
            except ValueError:
                continue
            # If the clusterer has only seen the word a few times, its
            # cluster is unreliable, so fall back to the '0' cluster.
            table[token] = bits if int(count) >= 3 else '0'
    # Expand clusters with re-casing. Iterate over a snapshot, because the
    # table grows while the variants are being added.
    for token, bits in list(table.items()):
        for variant in (token.lower(), token.title(), token.upper()):
            table.setdefault(variant, bits)
    return table
def populate_vocab(vocab, clusters, probs, oov_prob):
    """Create a lexeme for every word in ``probs`` and set its attributes.

    Words are visited from highest to lowest probability. For each one,
    ``vocab[word]`` is looked up and its ``prob``, ``is_oov`` and ``cluster``
    attributes are assigned.

    vocab: Mapping-like object indexed by word, returning a lexeme.
    clusters: dict mapping words to cluster bit strings.
    probs: dict mapping words to log probabilities.
    oov_prob: Accepted for interface compatibility; not used here.
    """
    ascending = sorted(probs.items(), key=lambda item: item[1])
    # Walk the ascending list backwards, so that ties come out in the
    # reverse of their sorted order.
    for word, prob in ascending[::-1]:
        lexeme = vocab[word]
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        lexeme.cluster = int(clusters[word][::-1], 2) if word in clusters else 0
def check_unzip(file_path):
    """Open ``file_path`` for reading text, decompressing gzip on the fly.

    file_path: ``pathlib.Path`` of the file. Paths whose POSIX string form
        ends in ``gz`` are treated as gzip archives.
    RETURNS: An open text-mode file object that yields decoded lines and
        can be used as a context manager. The caller must close it.
    """
    file_path_str = file_path.as_posix()
    if file_path_str.endswith('gz'):
        import io
        # gzip.open() yields bytes by default, whereas Path.open() yields
        # text. Wrap the stream so both branches hand back decoded lines.
        # The BufferedReader layer keeps this working on Python 2.7, where
        # GzipFile lacks the read1() method that TextIOWrapper needs.
        raw = gzip.open(file_path_str, 'rb')
        return io.TextIOWrapper(io.BufferedReader(raw))
    else:
        return file_path.open()
def check_dirs(freqs_data, clusters_data, vectors_data):
    """Exit with an error message if a supplied input file is missing.

    Despite the name, the checks are for regular files, not directories.

    freqs_data: ``pathlib.Path`` of the frequencies file (always checked).
    clusters_data: ``pathlib.Path`` of the clusters file, or a falsy value
        to skip the check.
    vectors_data: ``pathlib.Path`` of the vectors file, or a falsy value to
        skip the check.
    """
    if not freqs_data.is_file():
        util.sys_exit(freqs_data.as_posix(), title="No frequencies file found")
    optional_inputs = (
        (clusters_data, "No Brown clusters file found"),
        (vectors_data, "No word vectors file found"),
    )
    for path, title in optional_inputs:
        if path and not path.is_file():
            util.sys_exit(path.as_posix(), title=title)

View File

@ -205,7 +205,7 @@ p Retokenize the document, such that the span is merged into a single token.
p p
| The token within the span that's highest in the parse tree. If there's a | The token within the span that's highest in the parse tree. If there's a
| tie, the earlist is prefered. | tie, the earliest is preferred.
+aside-code("Example"). +aside-code("Example").
doc = nlp(u'I like New York in Autumn.') doc = nlp(u'I like New York in Autumn.')

View File

@ -39,7 +39,7 @@ p
+h(2, "special-cases") Adding special case tokenization rules +h(2, "special-cases") Adding special case tokenization rules
p p
| Most domains have at least some idiosyncracies that require custom | Most domains have at least some idiosyncrasies that require custom
| tokenization rules. This could be very certain expressions, or | tokenization rules. This could be very certain expressions, or
| abbreviations only used in this specific field. | abbreviations only used in this specific field.

View File

@ -109,7 +109,7 @@ p
| The other way to install spaCy is to clone its | The other way to install spaCy is to clone its
| #[+a(gh("spaCy")) GitHub repository] and build it from source. That is | #[+a(gh("spaCy")) GitHub repository] and build it from source. That is
| the common way if you want to make changes to the code base. You'll need to | the common way if you want to make changes to the code base. You'll need to
| make sure that you have a development enviroment consisting of a Python | make sure that you have a development environment consisting of a Python
| distribution including header files, a compiler, | distribution including header files, a compiler,
| #[+a("https://pip.pypa.io/en/latest/installing/") pip], | #[+a("https://pip.pypa.io/en/latest/installing/") pip],
| #[+a("https://virtualenv.pypa.io/") virtualenv] and | #[+a("https://virtualenv.pypa.io/") virtualenv] and

View File

@ -40,7 +40,7 @@ p
+cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres] +cell #[code VerbForm=Fin], #[code Mood=Ind], #[code Tense=Pres]
+row +row
+cell I read the paper yesteday +cell I read the paper yesterday
+cell read +cell read
+cell read +cell read
+cell verb +cell verb

View File

@ -94,7 +94,7 @@ p
| is mostly intended as a convenient, interactive wrapper. It performs | is mostly intended as a convenient, interactive wrapper. It performs
| compatibility checks and prints detailed error messages and warnings. | compatibility checks and prints detailed error messages and warnings.
| However, if you're downloading models as part of an automated build | However, if you're downloading models as part of an automated build
| process, this only adds an unecessary layer of complexity. If you know | process, this only adds an unnecessary layer of complexity. If you know
| which models your application needs, you should be specifying them directly. | which models your application needs, you should be specifying them directly.
p p