From 5ca1646d8af090283cdc3f832af4a24141f5a952 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 28 Sep 2015 18:07:11 +1000
Subject: [PATCH 1/3] * Mark model-requiring tests

---
 website/tests/test_home.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/website/tests/test_home.py b/website/tests/test_home.py
index ed710e107..3b27ba2f7 100644
--- a/website/tests/test_home.py
+++ b/website/tests/test_home.py
@@ -24,6 +24,7 @@ def test_load_resources_and_process_text():
     doc = nlp('Hello, world. Here are two sentences.')
 
 
+@pytest.mark.models
 def test_get_tokens_and_sentences(doc):
     token = doc[0]
     sentence = doc.sents.next()
@@ -65,6 +66,7 @@ def test_export_to_numpy_arrays(nlp, doc):
     assert list(doc_array[:, 1]) == [t.like_url for t in doc]
 
 
+@pytest.mark.models
 def test_word_vectors(nlp):
     doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
 
@@ -96,6 +98,7 @@ def test_part_of_speech_tags(nlp):
         print(token.tag_)
 
 
+@pytest.mark.models
 def test_syntactic_dependencies():
     def dependency_labels_to_root(token):
         '''Walk up the syntactic tree, collecting the arc labels.'''
@@ -106,6 +109,7 @@ def test_syntactic_dependencies():
         return dep_labels
 
 
+@pytest.mark.models
 def test_named_entities():
     def iter_products(docs):
         for doc in docs:
@@ -151,6 +155,7 @@ def test_calculate_inline_mark_up_on_original_string():
     return string
 
 
+@pytest.mark.models
 def test_efficient_binary_serialization(doc):
     byte_string = doc.as_bytes()
     open('/tmp/moby_dick.bin', 'wb').write(byte_string)

From d8276b3792960cbbafe6718fcc2d307a2292532e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 28 Sep 2015 19:34:34 +1000
Subject: [PATCH 2/3] * Move test_home to within tests/

---
 {website/tests => tests/website}/test_home.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
 rename {website/tests => tests/website}/test_home.py (95%)

diff --git a/website/tests/test_home.py b/tests/website/test_home.py
similarity index 95%
rename from website/tests/test_home.py
rename to tests/website/test_home.py
index 3b27ba2f7..8c1ab9b8e 100644
--- a/website/tests/test_home.py
+++ b/tests/website/test_home.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 import pytest
+import spacy.en
 
 
 @pytest.fixture(scope="session")
@@ -36,7 +37,7 @@ def test_use_integer_ids_for_any_strings(nlp, token):
     hello_id = nlp.vocab.strings['Hello']
     hello_str = nlp.vocab.strings[hello_id]
 
-    assert token.orth == hello_id == 469755
+    assert token.orth == hello_id == 3404
     assert token.orth_ == hello_str == 'Hello'
 
 
@@ -71,7 +72,7 @@ def test_word_vectors(nlp):
     doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
 
     apples = doc[0]
-    oranges = doc[1]
+    oranges = doc[2]
     boots = doc[6]
     hippos = doc[8]
 
@@ -157,10 +158,12 @@ def test_calculate_inline_mark_up_on_original_string():
 
 
 @pytest.mark.models
 def test_efficient_binary_serialization(doc):
-    byte_string = doc.as_bytes()
+    from spacy.tokens.doc import Doc
+
+    byte_string = doc.to_bytes()
     open('/tmp/moby_dick.bin', 'wb').write(byte_string)
     nlp = spacy.en.English()
-    for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
+    for byte_string in Doc.read_bytes(open('/tmp/moby_dick.bin', 'rb')):
         doc = Doc(nlp.vocab)
         doc.from_bytes(byte_string)

From c03e74272b999a45ea7e46ed7f0ad69409e56a01 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 28 Sep 2015 21:54:44 +1000
Subject: [PATCH 3/3] * Remove extraneous file.

---
 dev_setup.py | 137 ---------------------------------------------------
 1 file changed, 137 deletions(-)
 delete mode 100644 dev_setup.py

diff --git a/dev_setup.py b/dev_setup.py
deleted file mode 100644
index 8efaba40b..000000000
--- a/dev_setup.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python
-
-import subprocess
-
-# This is what we're down to...
-try:
-    import Cython
-except ImportError:
-    subprocess.call(['pip install cython'], shell=True)
-
-try:
-    import murmurhash
-except ImportError:
-    subprocess.call(['pip install murmurhash'], shell=True)
-
-try:
-    import cymem
-except ImportError:
-    subprocess.call(['pip install cymem'], shell=True)
-
-try:
-    import preshed
-except ImportError:
-    subprocess.call(['pip install preshed'], shell=True)
-
-try:
-    import thinc
-except ImportError:
-    subprocess.call(['pip install thinc'], shell=True)
-
-try:
-    import numpy
-except ImportError:
-    subprocess.call(['pip install numpy'], shell=True)
-
-
-import Cython.Distutils
-from Cython.Distutils import Extension
-import distutils.core
-
-import sys
-import os
-import os.path
-
-from os import path
-from glob import glob
-
-import numpy
-
-
-def clean(ext):
-    for pyx in ext.sources:
-        if pyx.endswith('.pyx'):
-            c = pyx[:-4] + '.c'
-            cpp = pyx[:-4] + '.cpp'
-            so = pyx[:-4] + '.so'
-            html = pyx[:-4] + '.html'
-            if os.path.exists(so):
-                os.unlink(so)
-            if os.path.exists(c):
-                os.unlink(c)
-            elif os.path.exists(cpp):
-                os.unlink(cpp)
-            if os.path.exists(html):
-                os.unlink(html)
-
-HERE = os.path.dirname(__file__)
-virtual_env = os.environ.get('VIRTUAL_ENV', '')
-compile_args = []
-link_args = []
-libs = []
-
-includes = ['.', numpy.get_include()]
-cython_includes = ['.']
-
-
-if 'VIRTUAL_ENV' in os.environ:
-    includes += glob(path.join(os.environ['VIRTUAL_ENV'], 'include', 'site', '*'))
-else:
-    # If you're not using virtualenv, set your include dir here.
-    pass
-
-ext_args = {'language': "c++", "include_dirs": includes}
-
-exts = [
-    Extension("spacy.typedefs", ["spacy/typedefs.pyx"], **ext_args),
-    Extension("spacy.strings", ["spacy/strings.pyx"], **ext_args),
-    Extension("spacy.lexeme", ["spacy/lexeme.pyx"], **ext_args),
-    Extension("spacy.vocab", ["spacy/vocab.pyx"], **ext_args),
-    Extension("spacy.tokens", ["spacy/tokens.pyx"], **ext_args),
-    Extension("spacy.morphology", ["spacy/morphology.pyx"], **ext_args),
-
-    Extension("spacy._ml", ["spacy/_ml.pyx"], **ext_args),
-
-    Extension("spacy.tokenizer", ["spacy/tokenizer.pyx"], **ext_args),
-    Extension("spacy.en.attrs", ["spacy/en/attrs.pyx"], **ext_args),
-    Extension("spacy.en.pos", ["spacy/en/pos.pyx"], **ext_args),
-    Extension("spacy.syntax.parser", ["spacy/syntax/parser.pyx"], **ext_args),
-    Extension("spacy.syntax._state", ["spacy/syntax/_state.pyx"], **ext_args),
-    Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args),
-    Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"],
-              **ext_args)
-
-    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.io_moves", ["spacy/ner/io_moves.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.greedy_parser", ["spacy/ner/greedy_parser.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.pystate", ["spacy/ner/pystate.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.context", ["spacy/ner/context.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.feats", ["spacy/ner/feats.pyx"], language="c++", include_dirs=includes),
-    #Extension("spacy.ner.annot", ["spacy/ner/annot.pyx"], language="c++", include_dirs=includes),
-]
-
-
-if sys.argv[1] == 'clean':
-    print >> sys.stderr, "cleaning .c, .c++ and .so files matching sources"
-    map(clean, exts)
-
-distutils.core.setup(
-    name='spacy',
-    packages=['spacy', 'spacy.en', 'spacy.syntax'],
-    description="Industrial-strength NLP",
-    author='Matthew Honnibal',
-    author_email='honnibal@gmail.com',
-    version='0.1',
-    url="http://honnibal.github.io/spaCy/",
-    package_data={"spacy": ["*.pxd"], "spacy.en": ["*.pxd", "data/pos/*",
-                                                   "data/wordnet/*", "data/tokenizer/*",
-                                                   "data/vocab/*"],
-                  "spacy.syntax": ["*.pxd"]},
-    cmdclass={'build_ext': Cython.Distutils.build_ext},
-    ext_modules=exts,
-    license="Dual: Commercial or AGPL",
-    requires=['cython', 'murmurhash', 'cymem', 'preshed', 'thinc', "unidecode",
-              "ujson"]
-)
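
A note on the first patch: the tests that need spaCy's statistical model data now carry the pytest marker "models", so they can be deselected on machines without that data. The conftest.py sketch below is an assumption added for illustration, not part of these commits; it only shows how such a marker is conventionally registered.

# conftest.py (hypothetical) -- register the "models" marker used in
# tests/website/test_home.py so pytest recognises it during "-m" filtering.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "models: test requires the installed spaCy model data")

With a marker like this, running "py.test tests/ -m 'not models'" skips the marked tests and "py.test tests/ -m models" runs only them; the "-m" selection is standard pytest behaviour and does not depend on anything in these patches.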