From c780bbda3e6db605aaa465400eda1de85d6ae0ea Mon Sep 17 00:00:00 2001
From: Andreas Grivas
Date: Tue, 3 Nov 2015 17:13:42 +0200
Subject: [PATCH] changed start end to properties - allow merge

---
 setup.py               | 230 -----------------------------------------
 spacy/tokens/doc.pyx   |  36 +++++--
 spacy/tokens/spans.pxd |   6 +-
 spacy/tokens/spans.pyx |  31 +++++-
 4 files changed, 61 insertions(+), 242 deletions(-)
 delete mode 100644 setup.py

diff --git a/setup.py b/setup.py
deleted file mode 100644
index 1d719c626..000000000
--- a/setup.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from setuptools import setup
-import shutil
-
-import sys
-import os
-from os import path
-
-from setuptools import Extension
-from distutils import sysconfig
-from distutils.core import setup, Extension
-from distutils.command.build_ext import build_ext
-
-import platform
-
-# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
-# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
-compile_options = {'msvc' : ['/Ox', '/EHsc'] ,
-                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] }
-link_options = {'msvc' : [] ,
-               'other' : [] }
-
-class build_ext_options:
-    def build_options(self):
-        c_type = None
-        if self.compiler.compiler_type in compile_options:
-            c_type = self.compiler.compiler_type
-        elif 'other' in compile_options:
-            c_type = 'other'
-        if c_type is not None:
-            for e in self.extensions:
-                e.extra_compile_args = compile_options[c_type]
-
-        l_type = None
-        if self.compiler.compiler_type in link_options:
-            l_type = self.compiler.compiler_type
-        elif 'other' in link_options:
-            l_type = 'other'
-        if l_type is not None:
-            for e in self.extensions:
-                e.extra_link_args = link_options[l_type]
-
-class build_ext_subclass( build_ext, build_ext_options ):
-    def build_extensions(self):
-        build_ext_options.build_options(self)
-        build_ext.build_extensions(self)
-
-
-
-# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
-# this is necessary to get it compile.
-# We have to resort to monkey-patching to set the compiler, because pypy broke
-# all the everything.
-
-pre_patch_customize_compiler = sysconfig.customize_compiler
-def my_customize_compiler(compiler):
-    pre_patch_customize_compiler(compiler)
-    compiler.compiler_cxx = ['c++']
-
-
-if platform.python_implementation() == 'PyPy':
-    sysconfig.customize_compiler = my_customize_compiler
-
-#def install_headers():
-#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
-#    if not path.exists(dest_dir):
-#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
-#
-#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
-
-
-includes = ['.', path.join(sys.prefix, 'include')]
-
-
-try:
-    import numpy
-    numpy_headers = path.join(numpy.get_include(), 'numpy')
-    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
-except ImportError:
-    pass
-except OSError:
-    pass
-
-
-def clean(mod_names):
-    for name in mod_names:
-        name = name.replace('.', '/')
-        so = name + '.so'
-        html = name + '.html'
-        cpp = name + '.cpp'
-        c = name + '.c'
-        for file_path in [so, html, cpp, c]:
-            if os.path.exists(file_path):
-                os.unlink(file_path)
-
-
-def name_to_path(mod_name, ext):
-    return '%s.%s' % (mod_name.replace('.', '/'), ext)
-
-
-def c_ext(mod_name, language, includes):
-    mod_path = name_to_path(mod_name, language)
-    return Extension(mod_name, [mod_path], include_dirs=includes)
-
-
-def cython_setup(mod_names, language, includes):
-    import Cython.Distutils
-    import Cython.Build
-    import distutils.core
-
-    class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ):
-        def build_extensions(self):
-            build_ext_options.build_options(self)
-            Cython.Distutils.build_ext.build_extensions(self)
-
-    if language == 'cpp':
-        language = 'c++'
-    exts = []
-    for mod_name in mod_names:
-        mod_path = mod_name.replace('.', '/') + '.pyx'
-        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
-        exts.append(e)
-    distutils.core.setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://honnibal.github.io/spaCy/",
-        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
-                      "spacy.tokens": ["*.pxd"],
-                      "spacy.serialize": ["*.pxd"],
-                      "spacy.en": ["*.pxd", "data/pos/*",
-                                   "data/wordnet/*", "data/tokenizer/*",
-                                   "data/vocab/tag_map.json",
-                                   "data/vocab/lexemes.bin",
-                                   "data/vocab/strings.json"],
-                      "spacy.syntax": ["*.pxd"]},
-        ext_modules=exts,
-        cmdclass={'build_ext': build_ext_cython_subclass},
-        license="MIT",
-    )
-
-
-def run_setup(exts):
-    setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge',
-                  'spacy.tests',
-                  'spacy.tests.matcher',
-                  'spacy.tests.morphology',
-                  'spacy.tests.munge',
-                  'spacy.tests.parser',
-                  'spacy.tests.serialize',
-                  'spacy.tests.spans',
-                  'spacy.tests.tagger',
-                  'spacy.tests.tokenizer',
-                  'spacy.tests.tokens',
-                  'spacy.tests.vectors',
-                  'spacy.tests.vocab'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://honnibal.github.io/spaCy/",
-        package_data={"spacy": ["*.pxd"],
-                      "spacy.en": ["*.pxd", "data/pos/*",
-                                   "data/wordnet/*", "data/tokenizer/*",
-                                   "data/vocab/lexemes.bin",
-                                   "data/vocab/serializer.json",
-                                   "data/vocab/oov_prob",
-                                   "data/vocab/strings.txt"],
-                      "spacy.syntax": ["*.pxd"]},
-        ext_modules=exts,
-        license="MIT",
-        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
-                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle'],
-        setup_requires=["headers_workaround"],
-        cmdclass = {'build_ext': build_ext_subclass },
-    )
-
-    import headers_workaround
-
-    headers_workaround.fix_venv_pypy_include()
-    headers_workaround.install_headers('murmurhash')
-    headers_workaround.install_headers('numpy')
-
-
-VERSION = '0.97'
-def main(modules, is_pypy):
-    language = "cpp"
-    includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.platform.startswith('darwin'):
-        compile_options['other'].append('-mmacosx-version-min=10.8')
-        compile_options['other'].append('-stdlib=libc++')
-        link_options['other'].append('-lc++')
-    if use_cython:
-        cython_setup(modules, language, includes)
-    else:
-        exts = [c_ext(mn, language, includes)
-                for mn in modules]
-        run_setup(exts)
-
-MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
-             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
-             'spacy.morphology', 'spacy.tagger',
-             'spacy.syntax.stateclass',
-             'spacy._ml', 'spacy._theano',
-             'spacy.tokenizer',
-             'spacy.syntax.parser',
-             'spacy.syntax.transition_system',
-             'spacy.syntax.arc_eager',
-             'spacy.syntax._parse_features',
-             'spacy.gold', 'spacy.orth',
-             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
-             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
-             'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner',
-             'spacy.symbols']
-
-
-if __name__ == '__main__':
-    if sys.argv[1] == 'clean':
-        clean(MOD_NAMES)
-    else:
-        use_cython = sys.argv[1] == 'build_ext'
-        main(MOD_NAMES, use_cython)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 01ccb4fd9..f1c8d2c71 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -439,11 +439,23 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
 
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/spans.pxd
index 54c0a3afb..bae9e4691 100644
--- a/spacy/tokens/spans.pxd
+++ b/spacy/tokens/spans.pxd
@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector
diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index 95b8e0de1..f4dcb15f0 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - these don't change when tokens are merged
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +79,30 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if nothing has been merged, the check below is false and the cached start token is returned
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if nothing has been merged, the cached end token is still valid (fast path)
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
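
Below is a minimal usage sketch (not part of the patch) of how the char-offset-based properties behave across a merge. It assumes spaCy 0.97 with the English data installed; the sentence, character offsets, and the tag/lemma/entity arguments are illustrative assumptions only:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The New York Times is a newspaper.')

    # Span over tokens 4-6 ("is a newspaper"); Span.__cinit__ stores its
    # character offsets (start_idx, end_idx) alongside the token indices.
    span = doc[4:7]

    # Merge "New York Times" (characters 4-18 of the text) into one token.
    doc.merge(4, 18, u'NNP', u'New York Times', u'ORG')

    # The merge shifted token indices, so the cached start_token/end_token
    # no longer line up with the stored char offsets; the start/end
    # properties re-resolve them via Doc.token_index_start/token_index_end.
    assert (span.start, span.end) == (2, 5)

If the stored offsets no longer match any token boundary, the lookup raises IndexError, which is what the new property getters guard against after a merge.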