changed start end to properties - allow merge

Andreas Grivas 2015-11-03 17:13:42 +02:00
parent 7691737fad
commit c780bbda3e
4 changed files with 61 additions and 242 deletions

setup.py

@@ -1,230 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path

from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext
import platform

# By subclassing build_extensions we have the actual compiler that will be
# used, which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc':  ['/Ox', '/EHsc'],
                   'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc':  [],
                'other': []}


class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]
        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)


# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
# this is necessary to get it to compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler


#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')

includes = ['.', path.join(sys.prefix, 'include')]

try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass


def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)


def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)
    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )


def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround

    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')


VERSION = '0.97'


def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)


MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']


if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
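
The piece of this deleted file worth keeping in mind is the compiler dispatch: build_ext_options.build_options runs at build_extensions time, because self.compiler.compiler_type is only known after finalize_options, and every non-MSVC toolchain falls back to the 'other' bucket. A minimal standalone sketch of that lookup (the 'unix' compiler-type string is illustrative, not from the file):

    # Sketch of the flag-dispatch pattern used in setup.py above.
    compile_options = {'msvc': ['/Ox', '/EHsc'],
                       'other': ['-O3', '-Wno-strict-prototypes']}

    def pick_flags(compiler_type, options):
        # Only 'msvc' is special-cased; anything else gets the 'other' bucket.
        key = compiler_type if compiler_type in options else 'other'
        return options.get(key, [])

    assert pick_flags('msvc', compile_options) == ['/Ox', '/EHsc']
    assert pick_flags('unix', compile_options) == ['-O3', '-Wno-strict-prototypes']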

spacy/tokens/doc.pyx

@@ -439,11 +439,23 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
+
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
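
In plain terms, the helpers added above scan the token table to translate character offsets into token indices, which is what lets merge (and, below, Span) address tokens by offsets that survive re-tokenization. A rough equivalent over toy records - Tok and the example sentence are hypothetical stand-ins for the TokenC array:

    from collections import namedtuple

    Tok = namedtuple('Tok', ['idx', 'length'])  # idx = char offset of token start

    # "New York is great" -> New | York | is | great
    tokens = [Tok(0, 3), Tok(4, 4), Tok(9, 2), Tok(12, 5)]

    def token_index_start(tokens, start_idx):
        # Index of the token beginning exactly at start_idx, else None.
        for i, t in enumerate(tokens):
            if t.idx == start_idx:
                return i
        return None

    def token_index_end(tokens, end_idx):
        # Exclusive slice end for the token ending exactly at end_idx, else None.
        for i, t in enumerate(tokens):
            if t.idx + t.length == end_idx:
                return i + 1
        return None

    assert token_index_start(tokens, 4) == 1   # "York" starts at char 4
    assert token_index_end(tokens, 8) == 2     # "York" ends at char 8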

spacy/tokens/spans.pxd

@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector
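
In effect the declarations above store each span twice: as token positions (start_token, end_token), which go stale when Doc.merge rewrites the token array, and as character offsets (start_idx, end_idx), which never move. Reusing the toy Tok records and lookup helpers from the doc.pyx sketch above, a merge shifts the token indices while the offsets keep identifying the same span:

    # "New York is great": the span over "is" is chars [9, 11) throughout.
    before = [Tok(0, 3), Tok(4, 4), Tok(9, 2), Tok(12, 5)]  # New | York | is | great
    after = [Tok(0, 8), Tok(9, 2), Tok(12, 5)]              # New York | is | great

    assert (token_index_start(before, 9), token_index_end(before, 11)) == (2, 3)
    assert (token_index_start(after, 9), token_index_end(after, 11)) == (1, 2)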

spacy/tokens/spans.pyx

@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - as these don't change when merging spans
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +79,30 @@
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if we haven't merged anything, the check below is false - so we get the start token
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if we haven't merged anything, we have fast access
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
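
Taken together, a Span created before a merge keeps resolving to the right tokens afterwards, because its start/end properties re-derive token positions from the stored character offsets. A hedged end-to-end sketch against the 0.97-era API shown in this commit (assumes the spacy.en data is installed; the tag/lemma/entity arguments are illustrative):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'New York is great')
    span = doc[2:4]   # "is great": start_token=2, end_token=4 at creation

    # Merge chars [0, 8) ("New York") into one token via range_from_indices.
    doc.merge(0, 8, u'NNP', u'New York', u'GPE')

    # The cached token indices are now stale; Span.start / Span.end notice the
    # idx mismatch and re-resolve via Doc.token_index_start / token_index_end.
    assert span.start == 1   # "is" moved from token 2 to token 1
    assert span.end == 3     # "great" now ends the span at token index 3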