assign lex after spans, add tests

Andreas Grivas 2015-11-04 19:49:22 +02:00
parent 015a84a5ec
commit 93918b5c23
3 changed files with 258 additions and 6 deletions

setup.py (new file, 230 lines)

@@ -0,0 +1,230 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path
from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext
import platform
# By overriding build_extensions we get the actual compiler that will be
# used, which is really only known after finalize_options has run:
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc':  ['/Ox', '/EHsc'],
                   'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc':  [],
                'other': []}
class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]
        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]
class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)
# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place.
# But, this is necessary to get it to compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler
#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
includes = ['.', path.join(sys.prefix, 'include')]
try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass
def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)
def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)

    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )
def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround
    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')
VERSION = '0.97'
def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']
if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
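
For a concrete sense of the convention the script relies on, both clean() and c_ext() map a dotted module name onto a path in the source tree. A minimal sketch (plain Python, not part of the diff; name_to_path is copied from above):

# Sketch of the mod-name -> source-path convention used by clean() and c_ext().
def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)

# 'spacy.syntax.parser' builds from spacy/syntax/parser.pyx under Cython;
# without Cython, c_ext() wraps the pre-generated C++ source instead.
assert name_to_path('spacy.syntax.parser', 'pyx') == 'spacy/syntax/parser.pyx'
assert name_to_path('spacy.syntax.parser', 'cpp') == 'spacy/syntax/parser.cpp'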


@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import pytest


@pytest.mark.models
def test_merge_tokens(EN):
    tokens = EN(u'Los Angeles start.')
    assert len(tokens) == 4
@@ -13,7 +12,6 @@ def test_merge_tokens(EN):
    assert tokens[0].head.orth_ == 'start'

@pytest.mark.models
def test_merge_heads(EN):
    tokens = EN(u'I found a pilates class near work.')
    assert len(tokens) == 8
@@ -32,7 +30,6 @@ def test_issue_54(EN):
    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
    tokens = EN(text)

@pytest.mark.models
def test_np_merges(EN):
    text = u'displaCy is a parse tool built with Javascript'
    tokens = EN(text)
@@ -47,3 +44,27 @@ def test_np_merges(EN):
        merged = tokens.merge(start, end, label, lemma, label)
        assert merged != None, (start, end, label, lemma)

def test_entity_merge(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
    assert(len(tokens) == 15)
    for ent in tokens.ents:
        label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
        ent.merge(label, lemma, type_)
    # Check that merging while looping over the entities is OK: the two
    # two-token entities each collapse to one token, so 15 becomes 13.
    assert(len(tokens) == 13)

def test_sentence_update_after_merge(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale')
    sent1, sent2 = list(tokens.sents)
    init_len = len(sent1)
    merge_me = tokens[0:2]
    merge_me.merge(u'none', u'none', u'none')
    assert(len(sent1) == init_len - 1)

def test_subtree_size_check(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
    sent1 = list(tokens.sents)[0]
    init_len = len(list(sent1.root.subtree))
    merge_me = tokens[0:2]
    merge_me.merge(u'none', u'none', u'none')
    assert(len(list(sent1.root.subtree)) == init_len - 1)
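
Taken together, the new tests pin merge behaviour down from the document, entity, sentence and subtree sides. For reference, a sketch of the two entry points they exercise, with argument order inferred from the calls above (EN is the English-pipeline test fixture, so this is illustrative rather than standalone code):

# Doc.merge takes character offsets plus the new token's annotations:
#     doc.merge(start_char, end_char, tag, lemma, ent_type) -> Token or None
# Span.merge takes the annotations only, reusing the span's own offsets:
#     span.merge(tag, lemma, ent_type)
doc = EN(u'Stewart Lee is a comedian')
span = doc[0:2]                                # u'Stewart Lee'
span.merge(u'NNP', u'Stewart Lee', u'PERSON')  # 5 tokens -> 4
assert len(doc) == 4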

spacy/tokens/doc.pyx

@@ -459,7 +459,6 @@ cdef class Doc:
    def range_from_indices(self, int start_idx, int end_idx):
        """Get a tuple -- the span of token indices that corresponds to the
        character indices (start_idx, end_idx), if such a span exists."""
        assert start_idx < end_idx
        cdef int i
        cdef int start = -1
        cdef int end = -1
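
The docstring's contract can be illustrated with a pure-Python sketch (toy data and hypothetical standalone code, not the Cython implementation):

tokens = [(0, 'Los'), (4, 'Angeles')]            # (char_idx, text) pairs

def range_from_indices(start_idx, end_idx):
    # Map character offsets to the token span they cover; None when the
    # offsets don't line up with token boundaries.
    start = end = -1
    for i, (idx, text) in enumerate(tokens):
        if idx == start_idx:
            start = i
        if idx + len(text) == end_idx:
            end = i + 1
    return (start, end) if start != -1 and end != -1 else None

assert range_from_indices(0, 11) == (0, 2)       # u'Los Angeles' -> tokens [0, 2)
assert range_from_indices(1, 11) is None         # 1 falls mid-token: no such span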
@@ -490,8 +489,6 @@
        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
        # House the new merged token where it starts
        cdef TokenC* token = &self.data[start]
        # Update fields
        token.lex = lex
        token.spacy = self.data[end-1].spacy
        # What to do about morphology??
        # TODO: token.morph = ???
@@ -509,6 +506,10 @@
        # bridges over the entity. Here the alignment of the tokens changes.
        span_root = span.root.i
        token.dep = span.root.dep
        # We update token.lex only after reading the span's root and dep:
        # assigning token.lex modifies the character offsets in the doc, which
        # would change the span.start and span.end properties.
        token.lex = lex
        for i in range(self.length):
            self.data[i].head += i
        # Set the head of the merged token, and its dep relation, from the Span
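
That comment is the crux of the commit. As a pure-Python analogy (hypothetical classes, not the real TokenC/LexemeC structs): a Span stores only character offsets, and anything derived from it is recomputed from the doc's current state, so the merged token's new, longer lexeme must be installed only after span.root and span.dep have been read. A minimal sketch:

class Lexeme(object):
    def __init__(self, orth):
        self.orth = orth

class Doc(object):
    def __init__(self, lexemes):
        self.lexemes = lexemes

    def idx(self, i):
        # Character offset of token i, derived from the current lexeme
        # lengths (plus one trailing space per token).
        return sum(len(lex.orth) + 1 for lex in self.lexemes[:i])

class Span(object):
    def __init__(self, doc, start_char, end_char):
        self.doc, self.start_char, self.end_char = doc, start_char, end_char

    @property
    def token_indices(self):
        # Recomputed from the doc's current offsets on every access.
        return [i for i in range(len(self.doc.lexemes))
                if self.start_char <= self.doc.idx(i) < self.end_char]

doc = Doc([Lexeme('Los'), Lexeme('Angeles'), Lexeme('start'), Lexeme('.')])
span = Span(doc, start_char=0, end_char=11)   # covers u'Los Angeles'
assert span.token_indices == [0, 1]           # read span-derived values now...

doc.lexemes[0] = Lexeme('Los Angeles')        # ...because after token.lex = lex
assert span.token_indices == [0]              # the same span resolves differently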