changed start end to properties - allow merge

Andreas Grivas 2015-11-03 17:13:42 +02:00
parent 7691737fad
commit c780bbda3e
4 changed files with 61 additions and 242 deletions

setup.py (deleted; all 230 lines removed)

@@ -1,230 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path

from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext

import platform

# By subclassing build_extensions we have the actual compiler that will be used,
# which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc'  : ['/Ox', '/EHsc'],
                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options    = {'msvc'  : [],
                   'other' : []}


class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]

        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)


# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
# this is necessary to get it compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler


#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')

includes = ['.', path.join(sys.prefix, 'include')]

try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass


def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)


def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)
    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )


def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround

    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')


VERSION = '0.97'

def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)


MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']


if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
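
The interesting part of the deleted file is the comment near the top: the concrete compiler is only known once build_extensions() runs, so per-compiler flags have to be injected from a build_ext subclass rather than passed to setup(). A minimal standalone sketch of that pattern (package, module, and flag names here are illustrative, not spaCy's):

    # sketch: choose per-compiler flags inside build_extensions(), where
    # self.compiler.compiler_type is finally known (it isn't during setup())
    from setuptools import setup, Extension
    from setuptools.command.build_ext import build_ext

    COMPILE_ARGS = {'msvc': ['/Ox'], 'unix': ['-O3']}

    class build_ext_with_options(build_ext):
        def build_extensions(self):
            # compiler_type is e.g. 'msvc' or 'unix'; fall back to no extra flags
            args = COMPILE_ARGS.get(self.compiler.compiler_type, [])
            for ext in self.extensions:
                ext.extra_compile_args = ext.extra_compile_args or args
            build_ext.build_extensions(self)

    setup(
        name='example',  # illustrative package name
        ext_modules=[Extension('example._fast', ['example/_fast.c'])],
        cmdclass={'build_ext': build_ext_with_options},
    )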

spacy/tokens/doc.pyx

@@ -439,11 +439,23 @@ cdef class Doc:
         keep_reading = False
         yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token.  Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
+
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token.  Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
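
The three helpers added above all rest on the same linear scan: a character range maps to a token range only if some token starts exactly at start_idx and some token ends exactly at end_idx. A plain-Python sketch of the range_from_indices logic, assuming each token exposes its start character offset idx and its length (as the token structs do here):

    # sketch of the lookup in range_from_indices, in plain Python;
    # `tokens` stands in for the doc's token array
    def range_from_indices(tokens, start_idx, end_idx):
        assert start_idx < end_idx
        start = -1
        for i, tok in enumerate(tokens):
            if tok.idx == start_idx:
                start = i                  # a token begins exactly here
            if tok.idx + tok.length == end_idx:
                if start == -1:
                    return None            # end matched before any start
                return (start, i + 1)      # half-open token range
        return None                        # offsets don't align with token bounds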

spacy/tokens/spans.pxd

@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector

spacy/tokens/spans.pyx

@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - as these don't change when merging spans
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +79,30 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if we haven't merged anything, the check below is false - so we get the start token
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if we haven't merged anything we have fast access
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
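
The net effect of the commit: merging tokens to the left of a Span shifts token indices, but character offsets stay fixed, so the new start/end properties can detect a stale cached index and rescan the doc. A usage sketch against the 0.x-era API (the sentence and offsets are illustrative, not from the source):

    from spacy.en import English  # 0.x-era entry point, as packaged in setup.py above

    nlp = English()
    doc = nlp(u'New York is a busy city')
    span = doc[4:6]               # u'busy city'; start_idx/end_idx cached on creation

    # merge u'New York' (characters 0-8) into one token: every token to the
    # right shifts left by one position, but character offsets are unchanged
    doc.merge(0, 8, u'NNP', u'New York', u'GPE')

    # span.start sees that doc[4] no longer begins at the cached start_idx,
    # rescans via doc.token_index_start(), and the span still covers the
    # same words - now as doc[3:5]
    print([t.text_with_ws for t in doc[span.start:span.end]])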