Mirror of https://github.com/explosion/spaCy.git

Commit a728374806

Merge remote-tracking branch 'refs/remotes/honnibal/master'

Conflicts:
	appveyor
@@ -18,11 +18,11 @@ environment:
    # The lastest Python 3.4.
    - PYTHON: "C:\\Python34-x64"
-     PYTHON_VERSION: "3.4.x" # currently 3.4.3
+     PYTHON_VERSION: "3.4.3" # currently 3.4.3
      PYTHON_ARCH: "64"

    - PYTHON: "C:\\Python34-x32"
-     PYTHON_VERSION: "3.4.x" # currently 3.4.3
+     PYTHON_VERSION: "3.4.3" # currently 3.4.3
      PYTHON_ARCH: "32"

.gitmodules (3 changed lines)

@@ -1,3 +0,0 @@
-[submodule "appveyor"]
-	path = appveyor
-	url = https://github.com/maxirmx/spaCy-appveyor-toolkit
@@ -24,4 +24,4 @@ install:

 # run tests
 script:
-  - "py.test tests/ -x"
+  - "py.test spacy/ -x"
@@ -1,5 +1,5 @@
-<img src="https://ci.appveyor.com/api/projects/status/aoe3dtkep36rdaqf?svg=true" />
 [![Travis CI status](https://travis-ci.org/honnibal/spaCy.svg?branch=master)](https://travis-ci.org/honnibal/spaCy)
+![Appveyor status](https://ci.appveyor.com/api/projects/status/aoe3dtkep36rdaqf?svg=true)

 spaCy: Industrial-strength NLP
 ==============================
fabfile.py (25 changed lines)

@@ -1,9 +1,12 @@
 from __future__ import print_function

 from fabric.api import local, lcd, env, settings, prefix
 from os.path import exists as file_exists
 from fabtools.python import virtualenv
 from os import path
+import os
+import shutil
+from pathlib import Path


 PWD = path.dirname(__file__)
@@ -56,10 +59,28 @@ def prebuild(build_dir='/tmp/build_spacy'):
     local('fab test')
     local('python setup.py sdist')


 def docs():
+    def jade(source_name, out_dir):
+        pwd = path.join(path.dirname(__file__), 'website')
+        jade_loc = path.join(pwd, 'src', 'jade', source_name)
+        out_loc = path.join(pwd, 'site', out_dir)
+        local('jade -P %s --out %s' % (jade_loc, out_loc))
+
     with virtualenv(VENV_DIR):
         with lcd(path.join(path.dirname(__file__), 'docs')):
             local('make html')
+    local('./website/create_code_samples tests/website/ website/src/code/')
+
+    jade('home/index.jade', '')
+    jade('docs/index.jade', 'docs/')
+    jade('blog/index.jade', 'blog/')
+    jade('tutorials/index.jade', 'tutorials/')
+
+    for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / 'blog').iterdir():
+        if post_dir.is_dir() \
+          and (post_dir / 'index.jade').exists() \
+          and (post_dir / 'meta.jade').exists():
+            jade(str(post_dir / 'index.jade'), path.join('blogs', post_dir.parts[-1]))


 def publish(version):
     with virtualenv(VENV_DIR):
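For orientation: the new docs() task compiles each top-level page with the jade CLI and then discovers per-post blog directories by requiring both an index.jade and a meta.jade. The sketch below only previews the commands such a build would issue, using the standard library alone; the website/src/jade layout is assumed from the paths hard-coded in the task, and nothing is executed.

# Sketch only: print the `jade -P <source> --out <dir>` commands a docs build
# like the fab task above would run. Assumes the website/src/jade layout from
# fabfile.py; no command is actually executed here.
from os import path
from pathlib import Path

JADE_SRC = path.join('website', 'src', 'jade')
SITE = path.join('website', 'site')

pages = [('home/index.jade', ''), ('docs/index.jade', 'docs/'),
         ('blog/index.jade', 'blog/'), ('tutorials/index.jade', 'tutorials/')]
for source, out_dir in pages:
    print('jade -P %s --out %s' % (path.join(JADE_SRC, source), path.join(SITE, out_dir)))

blog_root = Path(JADE_SRC) / 'blog'
if blog_root.is_dir():
    for post_dir in blog_root.iterdir():
        # A blog post directory ships both index.jade and meta.jade.
        if post_dir.is_dir() and (post_dir / 'index.jade').exists() \
                and (post_dir / 'meta.jade').exists():
            print('jade -P %s --out %s'
                  % (post_dir / 'index.jade', path.join(SITE, 'blogs', post_dir.name)))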
setup.py (18 changed lines)

@@ -128,7 +128,7 @@ def cython_setup(mod_names, language, includes):
         author_email='honnibal@gmail.com',
         version=VERSION,
         url="http://honnibal.github.io/spaCy/",
-        package_data={"spacy": ["*.pxd"],
+        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                       "spacy.tokens": ["*.pxd"],
                       "spacy.serialize": ["*.pxd"],
                       "spacy.en": ["*.pxd", "data/pos/*",
@@ -139,7 +139,7 @@ def cython_setup(mod_names, language, includes):
                       "spacy.syntax": ["*.pxd"]},
         ext_modules=exts,
         cmdclass={'build_ext': build_ext_cython_subclass},
-        license="Dual: Commercial or AGPL",
+        license="MIT",
     )


@@ -147,7 +147,19 @@ def run_setup(exts):
     setup(
         name='spacy',
         packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge'],
+                  'spacy.syntax', 'spacy.munge',
+                  'spacy.tests',
+                  'spacy.tests.matcher',
+                  'spacy.tests.morphology',
+                  'spacy.tests.munge',
+                  'spacy.tests.parser',
+                  'spacy.tests.serialize',
+                  'spacy.tests.spans',
+                  'spacy.tests.tagger',
+                  'spacy.tests.tokenizer',
+                  'spacy.tests.tokens',
+                  'spacy.tests.vectors',
+                  'spacy.tests.vocab'],
         description="Industrial-strength NLP",
         author='Matthew Honnibal',
         author_email='honnibal@gmail.com',
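The packages list above now spells out every spacy.tests.* subpackage so the test suite ships inside the sdist, matching the new tests/*.py entries in package_data. As a purely illustrative alternative (not what this setup.py does), setuptools can derive the same list automatically, provided each test directory contains an __init__.py:

# Illustrative alternative only: let setuptools discover the package list
# instead of enumerating it by hand. Every spacy.tests.* directory would need
# an __init__.py for this to pick it up.
from setuptools import find_packages

packages = find_packages(include=['spacy', 'spacy.*'])
print(sorted(packages))  # expected to include spacy.tests and its subpackages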
spacy/tests/test_pickle.py (new file, 27 lines)

@@ -0,0 +1,27 @@
+import cloudpickle
+import io
+import os
+import pickle
+import pytest
+import tempfile
+
+@pytest.mark.models
+def test_pickle_english(EN):
+    file_ = io.BytesIO()
+    cloudpickle.dump(EN, file_)
+
+    file_.seek(0)
+
+    loaded = pickle.load(file_)
+    assert loaded is not None
+
+@pytest.mark.models
+def test_cloudpickle_to_file(EN):
+    f = tempfile.NamedTemporaryFile(delete=False)
+    p = cloudpickle.CloudPickler(f)
+    p.dump(EN)
+    f.close()
+    loaded_en = cloudpickle.load(open(f.name))
+    os.unlink(f.name)
+    doc = loaded_en(unicode('test parse'))
+    assert len(doc) == 2
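Both new tests serialize a loaded English pipeline with cloudpickle (once into an io.BytesIO buffer, once through a named temporary file) and read it back with the standard pickle module. A minimal round-trip of the same shape, using a stand-in object rather than spaCy and a context-managed binary temp file, might look like this, assuming the third-party cloudpickle package is installed:

# Generic sketch of the round-trip the tests above exercise: write with
# cloudpickle, read back with the standard pickle module. The dict is a
# stand-in for the loaded pipeline; cloudpickle must be installed.
import pickle
import tempfile

import cloudpickle


def roundtrip(obj):
    with tempfile.TemporaryFile() as f:   # opened in binary mode by default
        cloudpickle.dump(obj, f)          # cloudpickle emits a normal pickle stream
        f.seek(0)
        return pickle.load(f)


restored = roundtrip({'text': 'test parse'})
assert restored == {'text': 'test parse'}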
@@ -2,6 +2,19 @@
 from __future__ import unicode_literals

 import pytest
+import io
+import pickle
+import cloudpickle
+import tempfile
+
+
+@pytest.mark.models
+def test_pickle(en_tokenizer):
+    file_ = io.BytesIO()
+    cloudpickle.dump(en_tokenizer, file_)
+    file_.seek(0)
+    loaded = pickle.load(file_)
+    assert loaded is not None


 def test_no_word(en_tokenizer):
@@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
     # text = """Today is Tuesday.Mr."""
     # tokens = en_tokenizer(text)
     # assert len(tokens) == 5
-    # assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
+    # assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']


 def test_cnts6(en_tokenizer):
@@ -19,6 +19,7 @@ cdef class Tokenizer:
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
+    cdef object _rules

     cpdef Doc tokens_from_list(self, list strings)

@@ -29,6 +29,16 @@ cdef class Tokenizer:
         self._infix_re = infix_re
         self.vocab = vocab
         self._load_special_tokenization(rules)
+        self._rules = rules
+
+    def __reduce__(self):
+        args = (self.vocab,
+                self._rules,
+                self._prefix_re,
+                self._suffix_re,
+                self._infix_re)
+
+        return (self.__class__, args, None, None)

     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
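The new __reduce__ method is what makes the tokenizer picklable: pickle records the returned (callable, args, ...) tuple and rebuilds the object on load by calling callable(*args), so the vocab, rules and the three compiled regexes are passed straight back into __init__. A toy illustration of that protocol, using a plain Python stand-in rather than the Cython Tokenizer, is below.

# Toy stand-in (not the real Tokenizer) showing the __reduce__ protocol used
# above: pickle stores (callable, args) and calls callable(*args) on load.
import pickle


class Splitter(object):
    def __init__(self, rules, prefix_re, suffix_re):
        self.rules = rules
        self.prefix_re = prefix_re
        self.suffix_re = suffix_re

    def __reduce__(self):
        args = (self.rules, self.prefix_re, self.suffix_re)
        return (self.__class__, args, None, None)


original = Splitter({"don't": ['do', "n't"]}, r'^["(]', r'[")]$')
restored = pickle.loads(pickle.dumps(original))
assert restored.rules == original.rules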
@@ -1,15 +0,0 @@
-import pytest
-import io
-import cloudpickle
-import pickle
-
-
-@pytest.mark.models
-def test_pickle_english(EN):
-    file_ = io.BytesIO()
-    cloudpickle.dump(EN, file_)
-
-    file_.seek(0)
-
-    loaded = pickle.load(file_)
-
@@ -1,52 +0,0 @@
-all: src/code site
-
-src/code:
-	mkdir -p src/code/
-	./create_code_samples ../tests/website/ src/code/
-
-site: site/index.html site/blog/ site/docs/ site/license/ site/blog/introducing-spacy/ site/blog/parsing-english-in-python/ site/blog/part-of-speech-POS-tagger-in-python/ site/tutorials/twitter-filter/ site/tutorials/syntax-search/ site/tutorials/mark-adverbs/ site/blog/writing-c-in-cython/ site/blog/how-spacy-works/
-
-site/index.html: src/jade/header.jade src/jade/*.jade
-	jade -P src/jade/home/index.jade --out site/
-
-site/docs/: src/jade/docs/*.jade src/jade/header.jade
-	jade -P src/jade/docs/index.jade --out $@
-
-site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade
-	jade -P src/jade/blog/index.jade --out $@
-
-site/tutorials/: src/jade/tutorials/*.jade site/tutorials/*/ src/jade/header.jade
-	jade -P src/jade/tutorials/index.jade --out $@
-
-site/blog/parsing-english-in-python/: src/jade/blog/parsing-english-in-python/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/writing-c-in-cython/: src/jade/blog/writing-c-in-cython/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/part-of-speech-POS-tagger-in-python/: src/jade/blog/part-of-speech-POS-tagger-in-python/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/introducing-spacy/: src/jade/blog/introducing-spacy/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/displacy/: src/jade/blog/displacy/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/eli5-computers-learn-reading: src/jade/blog/eli5-computers-learn-reading/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/tutorials/mark-adverbs/: src/jade/tutorials/mark-adverbs/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/blog/how-spacy-works/: src/jade/blog/how-spacy-works/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/tutorials/syntax-search/: src/jade/tutorials/syntax-search/*.jade src/jade/header.jade
-	jade -P $< --out $@
-
-site/tutorials/twitter-filter/: src/jade/tutorials/twitter-filter/*.jade src/jade/header.jade
-	jade -P $< --out $@
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import os
 import ast
+import io
 import re

 import plac

@@ -35,7 +35,7 @@ mixin comparison(name)
       +columns("System", "Language", "Accuracy", "Speed")

     tbody
-      +row("spaCy v0.93", "Cython", "91.8", "13,000 (est.)")
+      +row("spaCy v0.97", "Cython", "91.8", "13,000 (est.)")
       +row("ClearNLP", "Java", "91.7", "10,271")
       +row("CoreNLP", "Java", "89.6", "8,602")
       +row("MATE", "Java", "92.5", "550")
@@ -80,7 +80,7 @@ mixin comparison(name)
       li.con English only
       li.pro Python
     .col
-      h5 CoreNLP features:
+      h5 CoreNLP
      ul
        li.pro More accurate NER
        li.pro Coreference resolution
@@ -103,7 +103,7 @@ mixin comparison(name)
       li.pro Python

     .col
-      h5 ClearNLP:
+      h5 ClearNLP
      ul
        li.pro Semantic Role Labelling
        li.pro Model for biology/life-science
@@ -10,7 +10,7 @@ mixin Option(name, open)
   pre.language-bash
     code
       $ pip install --upgrade spacy
-      $ python -m spacy.en.download all
+      $ python -m spacy.en.download --force all
   p Most updates ship a new model, so you will usually have to redownload the data.


@@ -93,6 +93,17 @@ mixin Option(name, open)
   h4 What's New?


+  details
+    summary
+      h4 2015-10-24 v0.97: Reduce load time, bug fixes
+
+    ul
+      li Load the StringStore from a json list, instead of a text file. Accept a file-like object in the API instead of a path, for better flexibility.
+      li * Load from file, rather than path, in StringStore
+      li Fix bugs in download.py
+      li Require #[code --force] to over-write the data directory in download.py
+      li Fix bugs in #[code Matcher] and #[code doc.merge()]
+
   details
     summary
       h4 2015-09-21 v0.93: Bug fixes to word vectors. Rename .repvec to .vector. Rename .string attribute.
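One of the v0.97 bullets above says the StringStore now loads from a JSON list and accepts a file-like object rather than only a path. The snippet below sketches that general pattern with a hypothetical load_strings helper; it is not the real StringStore API, only an illustration of why taking file objects adds flexibility (in-memory buffers, compressed streams and sockets all work the same way).

# Hypothetical helper (NOT the real StringStore API) illustrating the
# "accept a file-like object instead of a path" pattern from the v0.97 notes.
import io
import json


def load_strings(source):
    if hasattr(source, 'read'):                      # already file-like
        return json.load(source)
    with io.open(source, encoding='utf8') as file_:  # otherwise treat as a path
        return json.load(file_)


# An in-memory buffer works just like an on-disk path would.
strings = load_strings(io.StringIO(u'["apple", "banana"]'))
assert strings == ['apple', 'banana']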
@@ -29,10 +29,10 @@ include ../header.jade
       li: a.button(href="#example-use") Examples
       li: a.button(href="#install")
         | Install
-        <span class="button-caption">v0.94</span>
+        <span class="button-caption">v0.97</span>

   article.page.landing-page
     +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
     +Section("Online Demo", "online-demo", "./_online_demo.jade")
     +Section("Usage by Example", "example-use", "./_usage_examples.jade")
-    +Section("Install v0.94", "install", "./_installation.jade")
+    +Section("Install v0.97", "install", "./_installation.jade")