Merge remote-tracking branch 'refs/remotes/honnibal/master'

Conflicts:
	appveyor
This commit is contained in:
maxirmx 2015-10-25 20:09:19 +03:00
commit a728374806
90 changed files with 113 additions and 87 deletions

View File

@ -18,11 +18,11 @@ environment:
# The latest Python 3.4.
- PYTHON: "C:\\Python34-x64"
PYTHON_VERSION: "3.4.x" # currently 3.4.3
PYTHON_VERSION: "3.4.3" # currently 3.4.3
PYTHON_ARCH: "64"
- PYTHON: "C:\\Python34-x32"
PYTHON_VERSION: "3.4.x" # currently 3.4.3
PYTHON_VERSION: "3.4.3" # currently 3.4.3
PYTHON_ARCH: "32"

3
.gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "appveyor"]
path = appveyor
url = https://github.com/maxirmx/spaCy-appveyor-toolkit

View File

@ -24,4 +24,4 @@ install:
# run tests
script:
- "py.test tests/ -x"
- "py.test spacy/ -x"

View File

@ -1,5 +1,5 @@
<img src="https://ci.appveyor.com/api/projects/status/aoe3dtkep36rdaqf?svg=true" />
[![Travis CI status](https://travis-ci.org/honnibal/spaCy.svg?branch=master)](https://travis-ci.org/honnibal/spaCy)
![Appveyor status](https://ci.appveyor.com/api/projects/status/aoe3dtkep36rdaqf?svg=true)
spaCy: Industrial-strength NLP
==============================

25
fabfile.py vendored
View File

@ -1,9 +1,12 @@
from __future__ import print_function
from fabric.api import local, lcd, env, settings, prefix
from os.path import exists as file_exists
from fabtools.python import virtualenv
from os import path
import os
import shutil
from pathlib import Path
PWD = path.dirname(__file__)
@ -56,10 +59,28 @@ def prebuild(build_dir='/tmp/build_spacy'):
local('fab test')
local('python setup.py sdist')
# docs(): fab task that shells out to the website code-sample generator and to the
# `jade` CLI to render pages from website/src/jade into website/site.
# NOTE(review): indentation is stripped in this listing and it reads like a diff hunk
# with old and new lines interleaved (the lcd/'make html' pair may be the pre-change
# body) -- confirm the real nesting against fabfile.py before relying on it.
def docs():
# Helper: render one Jade source, given relative to website/src/jade, into
# website/site/<out_dir> by running `jade -P <src> --out <dst>` locally.
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
out_loc = path.join(pwd, 'site', out_dir)
local('jade -P %s --out %s' % (jade_loc, out_loc))
with virtualenv(VENV_DIR):
with lcd(path.join(path.dirname(__file__), 'docs')):
local('make html')
# Command path is relative, so this presumably expects to run from the repo root -- TODO confirm.
local('./website/create_code_samples tests/website/ website/src/code/')
# Top-level pages: site index, docs, blog index and tutorials index.
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
jade('blog/index.jade', 'blog/')
jade('tutorials/index.jade', 'tutorials/')
# Render every blog post directory that has both index.jade and meta.jade.
# NOTE(review): str(post_dir / 'index.jade') already contains the website/src/jade
# prefix that jade() joins on again; os.path.join only discards the earlier
# components when the later one is absolute -- confirm __file__ is absolute under fab.
# NOTE(review): posts are written under 'blogs/<name>' while the index above goes to
# 'blog/' -- confirm the differing directory name is intended.
for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / 'blog').iterdir():
if post_dir.is_dir() \
and (post_dir / 'index.jade').exists() \
and (post_dir / 'meta.jade').exists():
jade(str(post_dir / 'index.jade'), path.join('blogs', post_dir.parts[-1]))
def publish(version):
with virtualenv(VENV_DIR):

View File

@ -128,7 +128,7 @@ def cython_setup(mod_names, language, includes):
author_email='honnibal@gmail.com',
version=VERSION,
url="http://honnibal.github.io/spaCy/",
package_data={"spacy": ["*.pxd"],
package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
"spacy.tokens": ["*.pxd"],
"spacy.serialize": ["*.pxd"],
"spacy.en": ["*.pxd", "data/pos/*",
@ -139,7 +139,7 @@ def cython_setup(mod_names, language, includes):
"spacy.syntax": ["*.pxd"]},
ext_modules=exts,
cmdclass={'build_ext': build_ext_cython_subclass},
license="Dual: Commercial or AGPL",
license="MIT",
)
@ -147,7 +147,19 @@ def run_setup(exts):
setup(
name='spacy',
packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
'spacy.syntax', 'spacy.munge'],
'spacy.syntax', 'spacy.munge',
'spacy.tests',
'spacy.tests.matcher',
'spacy.tests.morphology',
'spacy.tests.munge',
'spacy.tests.parser',
'spacy.tests.serialize',
'spacy.tests.spans',
'spacy.tests.tagger',
'spacy.tests.tokenizer',
'spacy.tests.tokens',
'spacy.tests.vectors',
'spacy.tests.vocab'],
description="Industrial-strength NLP",
author='Matthew Honnibal',
author_email='honnibal@gmail.com',

View File

@ -0,0 +1,27 @@
import cloudpickle
import io
import os
import pickle
import pytest
import tempfile
@pytest.mark.models
def test_pickle_english(EN):
    """Dump the EN fixture with cloudpickle and read it back with stdlib pickle."""
    stream = io.BytesIO()
    cloudpickle.dump(EN, stream)
    # Rewind so the unpickler starts at the first byte written.
    stream.seek(0)
    restored = pickle.load(stream)
    assert restored is not None
@pytest.mark.models
def test_cloudpickle_to_file(EN):
    """Pickle the EN fixture to a file on disk, load it back and parse with it.

    The reloaded object must tokenize 'test parse' into two tokens.
    """
    # delete=False: the file has to outlive close() so it can be reopened by
    # name; reopening a still-open NamedTemporaryFile is not portable.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with tmp:
            cloudpickle.CloudPickler(tmp).dump(EN)
        # Pickle streams are binary: read back in 'rb' mode and close the
        # handle deterministically instead of leaking it.
        with open(tmp.name, 'rb') as file_:
            loaded_en = cloudpickle.load(file_)
    finally:
        # Always remove the temp file, even if dumping or loading failed.
        os.unlink(tmp.name)
    # u'' literal rather than unicode(): the builtin is absent on Python 3.
    doc = loaded_en(u'test parse')
    assert len(doc) == 2

View File

@ -2,6 +2,19 @@
from __future__ import unicode_literals
import pytest
import io
import pickle
import cloudpickle
import tempfile
@pytest.mark.models
def test_pickle(en_tokenizer):
    """Dump the tokenizer fixture with cloudpickle and reload it with stdlib pickle."""
    buffer = io.BytesIO()
    cloudpickle.dump(en_tokenizer, buffer)
    # Rewind before reading back what was just written.
    buffer.seek(0)
    restored = pickle.load(buffer)
    assert restored is not None
def test_no_word(en_tokenizer):
@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
# text = """Today is Tuesday.Mr."""
# tokens = en_tokenizer(text)
# assert len(tokens) == 5
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
# assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
def test_cnts6(en_tokenizer):

View File

@ -19,6 +19,7 @@ cdef class Tokenizer:
cdef object _prefix_re
cdef object _suffix_re
cdef object _infix_re
cdef object _rules
cpdef Doc tokens_from_list(self, list strings)

View File

@ -29,6 +29,16 @@ cdef class Tokenizer:
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules)
self._rules = rules
def __reduce__(self):
    """Pickle hook: describe how to rebuild this object.

    Returns a 4-tuple of (callable, args, state, listitems). The callable is
    this object's class; the args are the vocab, the stored rules and the
    prefix/suffix/infix patterns, in that order. No extra state or list
    items are supplied.
    """
    ctor_args = (
        self.vocab,
        self._rules,
        self._prefix_re,
        self._suffix_re,
        self._infix_re,
    )
    return (self.__class__, ctor_args, None, None)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):

View File

@ -1,15 +0,0 @@
import pytest
import io
import cloudpickle
import pickle
@pytest.mark.models
def test_pickle_english(EN):
file_ = io.BytesIO()
cloudpickle.dump(EN, file_)
file_.seek(0)
loaded = pickle.load(file_)

View File

@ -1,52 +0,0 @@
all: src/code site
src/code:
mkdir -p src/code/
./create_code_samples ../tests/website/ src/code/
site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/
site/index.html: src/jade/header.jade src/jade/*.jade
jade -P src/jade/home/index.jade --out site/
site/docs/: src/jade/docs/*.jade src/jade/header.jade
jade -P src/jade/docs/index.jade --out $@
site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade
jade -P src/jade/blog/index.jade --out $@
site/tutorials/: src/jade/tutorials/*.jade site/tutorials/*/ src/jade/header.jade
jade -P src/jade/tutorials/index.jade --out $@
site/blog/parsing-english-in-python/: src/jade/blog/parsing-english-in-python/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/writing-c-in-cython/: src/jade/blog/writing-c-in-cython/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/part-of-speech-POS-tagger-in-python/: src/jade/blog/part-of-speech-POS-tagger-in-python/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/introducing-spacy/: src/jade/blog/introducing-spacy/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/displacy/: src/jade/blog/displacy/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/eli5-computers-learn-reading: src/jade/blog/eli5-computers-learn-reading/*.jade src/jade/header.jade
jade -P $< --out $@
site/tutorials/mark-adverbs/: src/jade/tutorials/mark-adverbs/*.jade src/jade/header.jade
jade -P $< --out $@
site/blog/how-spacy-works/: src/jade/blog/how-spacy-works/*.jade src/jade/header.jade
jade -P $< --out $@
site/tutorials/syntax-search/: src/jade/tutorials/syntax-search/*.jade src/jade/header.jade
jade -P $< --out $@
site/tutorials/twitter-filter/: src/jade/tutorials/twitter-filter/*.jade src/jade/header.jade
jade -P $< --out $@

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import os
import ast
import io
import re
import plac

View File

@ -35,7 +35,7 @@ mixin comparison(name)
+columns("System", "Language", "Accuracy", "Speed")
tbody
+row("spaCy v0.93", "Cython", "91.8", "13,000 (est.)")
+row("spaCy v0.97", "Cython", "91.8", "13,000 (est.)")
+row("ClearNLP", "Java", "91.7", "10,271")
+row("CoreNLP", "Java", "89.6", "8,602")
+row("MATE", "Java", "92.5", "550")
@ -80,7 +80,7 @@ mixin comparison(name)
li.con English only
li.pro Python
.col
h5 CoreNLP features:
h5 CoreNLP
ul
li.pro More accurate NER
li.pro Coreference resolution
@ -103,7 +103,7 @@ mixin comparison(name)
li.pro Python
.col
h5 ClearNLP:
h5 ClearNLP
ul
li.pro Semantic Role Labelling
li.pro Model for biology/life-science

View File

@ -10,7 +10,7 @@ mixin Option(name, open)
pre.language-bash
code
$ pip install --upgrade spacy
$ python -m spacy.en.download all
$ python -m spacy.en.download --force all
p Most updates ship a new model, so you will usually have to redownload the data.
@ -93,6 +93,17 @@ mixin Option(name, open)
h4 What's New?
details
summary
h4 2015-10-24 v0.97: Reduce load time, bug fixes
ul
li Load the StringStore from a json list, instead of a text file. Accept a file-like object in the API instead of a path, for better flexibility.
li Load from file, rather than path, in StringStore
li Fix bugs in download.py
li Require #[code --force] to over-write the data directory in download.py
li Fix bugs in #[code Matcher] and #[code doc.merge()]
details
summary
h4 2015-09-21 v0.93: Bug fixes to word vectors. Rename .repvec to .vector. Rename .string attribute.

View File

@ -29,10 +29,10 @@ include ../header.jade
li: a.button(href="#example-use") Examples
li: a.button(href="#install")
| Install
<span class="button-caption">v0.94</span>
<span class="button-caption">v0.97</span>
article.page.landing-page
+Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
+Section("Online Demo", "online-demo", "./_online_demo.jade")
+Section("Usage by Example", "example-use", "./_usage_examples.jade")
+Section("Install v0.94", "install", "./_installation.jade")
+Section("Install v0.97", "install", "./_installation.jade")