From 8edd58492e43088212dc9c32c7ed9883f0abad18 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 13:41:39 +0200 Subject: [PATCH 1/8] fixed unicode error in printing - added tests --- spacy/tests/print/test_print.py | 98 +++++++++++++++++++++++++++++++++ spacy/tokens/doc.pyx | 4 +- spacy/tokens/spans.pyx | 2 +- spacy/tokens/token.pyx | 4 +- 4 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/print/test_print.py diff --git a/spacy/tests/print/test_print.py b/spacy/tests/print/test_print.py new file mode 100644 index 000000000..744a813d6 --- /dev/null +++ b/spacy/tests/print/test_print.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +import pytest + + +def test_print_doc(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store') + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_doc(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store') + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_doc_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café') + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_doc_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café') + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_span(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-3:] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_span(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-3:] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_span_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-3:] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_span_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-3:] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_token(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-1] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_token(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-1] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_token_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-1] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_token_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-1] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1626ebfc6..957bc59e6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -118,10 +118,10 @@ cdef class Doc: return u''.join([t.string for t in self]) def __str__(self): - return u''.join([t.string for t in self]) + return u''.join([t.string for t in self]).encode('utf-8') def __repr__(self): - return u''.join([t.string for t in self]) + return u''.join([t.string for t in self]).encode('utf-8') def similarity(self, other): if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index e1b881f79..1f6b07636 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -50,7 +50,7 @@ cdef class Span: text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] - return text + return text.encode('utf-8') def 
__getitem__(self, object i): if isinstance(i, slice): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cce8eeeb4..02ef52d0c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -41,10 +41,10 @@ cdef class Token: return self.string def __str__(self): - return self.string + return self.string.encode('utf-8') def __repr__(self): - return self.string + return self.string.encode('utf-8') cpdef bint check_flag(self, attr_id_t flag_id) except -1: return Lexeme.c_check_flag(self.c.lex, flag_id) From b5ce7a6e96f46ab9fad485bd780ec93c00bd116c Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 19:40:37 +0200 Subject: [PATCH 2/8] fix py3 incompatibility --- spacy/tokens/doc.pyx | 10 ++++++++-- spacy/tokens/spans.pyx | 8 ++++---- spacy/tokens/token.pyx | 10 ++++++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 957bc59e6..01ccb4fd9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,6 +7,7 @@ import numpy.linalg import struct cimport numpy as np import math +import six from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME @@ -117,11 +118,16 @@ cdef class Doc: def __unicode__(self): return u''.join([t.string for t in self]) - def __str__(self): + def __bytes__(self): return u''.join([t.string for t in self]).encode('utf-8') + def __str__(self): + if six.PY3: + return self.__unicode__() + return self.__bytes__() + def __repr__(self): - return u''.join([t.string for t in self]).encode('utf-8') + return self.__str__() def similarity(self, other): if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 1f6b07636..95b8e0de1 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -4,6 +4,7 @@ import numpy import numpy.linalg cimport numpy as np import math +import six from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t @@ -47,10 +48,9 @@ cdef class Span: return self.end - self.start def __repr__(self): - text = self.text_with_ws - if self[-1].whitespace_: - text = text[:-1] - return text.encode('utf-8') + if six.PY3: + return self.text + return self.text.encode('utf-8') def __getitem__(self, object i): if isinstance(i, slice): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 02ef52d0c..81b850285 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -6,6 +6,7 @@ cimport numpy as np np.import_array() import numpy +import six from ..lexeme cimport Lexeme @@ -40,11 +41,16 @@ cdef class Token: def __unicode__(self): return self.string - def __str__(self): + def __bytes__(self): return self.string.encode('utf-8') + def __str__(self): + if six.PY3: + return self.__unicode__() + return self.__bytes__() + def __repr__(self): - return self.string.encode('utf-8') + return self.__str__() cpdef bint check_flag(self, attr_id_t flag_id) except -1: return Lexeme.c_check_flag(self.c.lex, flag_id) From 52ede05f9de21147dd3e0525b2d2615e250e2411 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 19:41:30 +0200 Subject: [PATCH 3/8] fix naming --- spacy/tests/print/test_print.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/tests/print/test_print.py b/spacy/tests/print/test_print.py index 744a813d6..4740f44e6 100644 --- a/spacy/tests/print/test_print.py +++ b/spacy/tests/print/test_print.py @@ -36,63 +36,63 @@ def test_repr_doc_unicode(EN): def test_print_span(EN): try: - doc = 
EN(u'I sat down for coffee at the coffee store')[-3:] - print(doc) + span = EN(u'I sat down for coffee at the coffee store')[-3:] + print(span) except Exception: pytest.fail("Printing failed") def test_repr_span(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-3:] - print(repr(doc)) + span = EN(u'I sat down for coffee at the coffee store')[-3:] + print(repr(span)) except Exception: pytest.fail("Printing failed") def test_print_span_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-3:] - print(doc) + span = EN(u'I sat down for coffee at the café')[-3:] + print(span) except Exception: pytest.fail("Printing failed") def test_repr_span_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-3:] - print(repr(doc)) + span = EN(u'I sat down for coffee at the café')[-3:] + print(repr(span)) except Exception: pytest.fail("Printing failed") def test_print_token(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-1] - print(doc) + token = EN(u'I sat down for coffee at the coffee store')[-1] + print(token) except Exception: pytest.fail("Printing failed") def test_repr_token(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-1] - print(repr(doc)) + token = EN(u'I sat down for coffee at the coffee store')[-1] + print(repr(token)) except Exception: pytest.fail("Printing failed") def test_print_token_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-1] - print(doc) + token = EN(u'I sat down for coffee at the café')[-1] + print(token) except Exception: pytest.fail("Printing failed") def test_repr_token_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-1] - print(repr(doc)) + token = EN(u'I sat down for coffee at the café')[-1] + print(repr(token)) except Exception: pytest.fail("Printing failed") From c780bbda3e6db605aaa465400eda1de85d6ae0ea Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Tue, 3 Nov 2015 17:13:42 +0200 Subject: [PATCH 4/8] changed start end to properties - allow merge --- setup.py | 230 ----------------------------------------- spacy/tokens/doc.pyx | 36 +++++-- spacy/tokens/spans.pxd | 6 +- spacy/tokens/spans.pyx | 31 +++++- 4 files changed, 61 insertions(+), 242 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 1d719c626..000000000 --- a/setup.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup -import shutil - -import sys -import os -from os import path - -from setuptools import Extension -from distutils import sysconfig -from distutils.core import setup, Extension -from distutils.command.build_ext import build_ext - -import platform - -# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options -# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used -compile_options = {'msvc' : ['/Ox', '/EHsc'] , - 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] } -link_options = {'msvc' : [] , - 'other' : [] } -class build_ext_options: - def build_options(self): - c_type = None - if self.compiler.compiler_type in compile_options: - c_type = self.compiler.compiler_type - elif 'other' in compile_options: - c_type = 'other' - if c_type is not None: - for e in self.extensions: - e.extra_compile_args = compile_options[c_type] - - l_type = None - if self.compiler.compiler_type in link_options: - l_type = self.compiler.compiler_type - elif 'other' in 
link_options: - l_type = 'other' - if l_type is not None: - for e in self.extensions: - e.extra_link_args = link_options[l_type] - -class build_ext_subclass( build_ext, build_ext_options ): - def build_extensions(self): - build_ext_options.build_options(self) - build_ext.build_extensions(self) - - - -# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But, -# this is necessary to get it compile. -# We have to resort to monkey-patching to set the compiler, because pypy broke -# all the everything. - -pre_patch_customize_compiler = sysconfig.customize_compiler -def my_customize_compiler(compiler): - pre_patch_customize_compiler(compiler) - compiler.compiler_cxx = ['c++'] - - -if platform.python_implementation() == 'PyPy': - sysconfig.customize_compiler = my_customize_compiler - -#def install_headers(): -# dest_dir = path.join(sys.prefix, 'include', 'murmurhash') -# if not path.exists(dest_dir): -# shutil.copytree('murmurhash/headers/murmurhash', dest_dir) -# -# dest_dir = path.join(sys.prefix, 'include', 'numpy') - - -includes = ['.', path.join(sys.prefix, 'include')] - - -try: - import numpy - numpy_headers = path.join(numpy.get_include(), 'numpy') - shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy')) -except ImportError: - pass -except OSError: - pass - - -def clean(mod_names): - for name in mod_names: - name = name.replace('.', '/') - so = name + '.so' - html = name + '.html' - cpp = name + '.cpp' - c = name + '.c' - for file_path in [so, html, cpp, c]: - if os.path.exists(file_path): - os.unlink(file_path) - - -def name_to_path(mod_name, ext): - return '%s.%s' % (mod_name.replace('.', '/'), ext) - - -def c_ext(mod_name, language, includes): - mod_path = name_to_path(mod_name, language) - return Extension(mod_name, [mod_path], include_dirs=includes) - - -def cython_setup(mod_names, language, includes): - import Cython.Distutils - import Cython.Build - import distutils.core - - class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ): - def build_extensions(self): - build_ext_options.build_options(self) - Cython.Distutils.build_ext.build_extensions(self) - - if language == 'cpp': - language = 'c++' - exts = [] - for mod_name in mod_names: - mod_path = mod_name.replace('.', '/') + '.pyx' - e = Extension(mod_name, [mod_path], language=language, include_dirs=includes) - exts.append(e) - distutils.core.setup( - name='spacy', - packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', - 'spacy.syntax', 'spacy.munge'], - description="Industrial-strength NLP", - author='Matthew Honnibal', - author_email='honnibal@gmail.com', - version=VERSION, - url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"], - "spacy.tokens": ["*.pxd"], - "spacy.serialize": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/tag_map.json", - "data/vocab/lexemes.bin", - "data/vocab/strings.json"], - "spacy.syntax": ["*.pxd"]}, - ext_modules=exts, - cmdclass={'build_ext': build_ext_cython_subclass}, - license="MIT", - ) - - -def run_setup(exts): - setup( - name='spacy', - packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', - 'spacy.syntax', 'spacy.munge', - 'spacy.tests', - 'spacy.tests.matcher', - 'spacy.tests.morphology', - 'spacy.tests.munge', - 'spacy.tests.parser', - 'spacy.tests.serialize', - 'spacy.tests.spans', - 'spacy.tests.tagger', - 'spacy.tests.tokenizer', - 'spacy.tests.tokens', - 'spacy.tests.vectors', - 
'spacy.tests.vocab'], - description="Industrial-strength NLP", - author='Matthew Honnibal', - author_email='honnibal@gmail.com', - version=VERSION, - url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/lexemes.bin", - "data/vocab/serializer.json", - "data/vocab/oov_prob", - "data/vocab/strings.txt"], - "spacy.syntax": ["*.pxd"]}, - ext_modules=exts, - license="MIT", - install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43', - 'thinc >= 3.4.1', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle'], - setup_requires=["headers_workaround"], - cmdclass = {'build_ext': build_ext_subclass }, - ) - - import headers_workaround - - headers_workaround.fix_venv_pypy_include() - headers_workaround.install_headers('murmurhash') - headers_workaround.install_headers('numpy') - - -VERSION = '0.97' -def main(modules, is_pypy): - language = "cpp" - includes = ['.', path.join(sys.prefix, 'include')] - if sys.platform.startswith('darwin'): - compile_options['other'].append('-mmacosx-version-min=10.8') - compile_options['other'].append('-stdlib=libc++') - link_options['other'].append('-lc++') - if use_cython: - cython_setup(modules, language, includes) - else: - exts = [c_ext(mn, language, includes) - for mn in modules] - run_setup(exts) - -MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', - 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', 'spacy.tagger', - 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy._theano', - 'spacy.tokenizer', - 'spacy.syntax.parser', - 'spacy.syntax.transition_system', - 'spacy.syntax.arc_eager', - 'spacy.syntax._parse_features', - 'spacy.gold', 'spacy.orth', - 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', - 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', - 'spacy.cfile', 'spacy.matcher', - 'spacy.syntax.ner', - 'spacy.symbols'] - - -if __name__ == '__main__': - if sys.argv[1] == 'clean': - clean(MOD_NAMES) - else: - use_cython = sys.argv[1] == 'build_ext' - main(MOD_NAMES, use_cython) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..f1c8d2c71 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -439,11 +439,23 @@ cdef class Doc: keep_reading = False yield n_bytes_str + data - # This function is terrible --- need to fix this. - def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, - unicode ent_type): - """Merge a multi-word expression into a single token. Currently - experimental; API is likely to change.""" + + def token_index_start(self, int start_idx): + cdef int i + for i in range(self.length): + if self.data[i].idx == start_idx: + return i + return None + + def token_index_end(self, int end_idx): + cdef int i + for i in range(self.length): + if (self.data[i].idx + self.data[i].lex.length) == end_idx: + return i + 1 + return None + + def range_from_indices(self, int start_idx, int end_idx): + assert start_idx < end_idx cdef int i cdef int start = -1 cdef int end = -1 @@ -454,10 +466,18 @@ cdef class Doc: if start == -1: return None end = i + 1 - break - else: - return None + return (start, end) + return None + # This function is terrible --- need to fix this. + def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, + unicode ent_type): + """Merge a multi-word expression into a single token. 
Currently + experimental; API is likely to change.""" + start_end = self.range_from_indices(start_idx, end_idx) + if start_end is None: + return None + start, end = start_end cdef Span span = self[start:end] # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/spans.pxd index 54c0a3afb..bae9e4691 100644 --- a/spacy/tokens/spans.pxd +++ b/spacy/tokens/spans.pxd @@ -4,8 +4,10 @@ from .doc cimport Doc cdef class Span: cdef readonly Doc doc cdef public int i - cdef public int start - cdef public int end + cdef public int start_token + cdef public int end_token + cdef public int start_idx + cdef public int end_idx cdef readonly int label cdef public _vector diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 95b8e0de1..f4dcb15f0 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -21,8 +21,11 @@ cdef class Span: raise IndexError self.doc = tokens - self.start = start - self.end = end + # keep char offsets - as these don't change when merging spans + self.start_token = start + self.start_idx = self.doc[start].idx + self.end_token = end + self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1]) self.label = label self._vector = vector self._vector_norm = vector_norm @@ -76,6 +79,30 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + property start: + def __get__(self): + # if we haven't merged anything below check is false - so we get start token + if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx: + new_start = self.doc.token_index_start(self.start_idx) + if new_start is not None: + self.start_token = new_start + else: + raise IndexError('Something went terribly wrong during a merge.' + 'No token found with idx %s' % self.start_idx) + return self.start_token + + property end: + def __get__(self): + # if we haven't merged anything we have fast access + if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx: + new_end = self.doc.token_index_end(self.end_idx) + if new_end is not None: + self.end_token = new_end + else: + raise IndexError('Something went terribly wrong during a merge.' 
+ 'No token found with idx %s' % self.end_idx) + return self.end_token + property vector: def __get__(self): if self._vector is None: From 015a84a5ecf72ae6703046e2fc85f3b7bc58ad2a Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 12:56:07 +0200 Subject: [PATCH 5/8] added comments --- spacy/tokens/doc.pyx | 4 ++++ spacy/tokens/spans.pyx | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f1c8d2c71..d11054e35 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -441,6 +441,7 @@ cdef class Doc: def token_index_start(self, int start_idx): + """ Get index of token in doc that has character index start_idx """ cdef int i for i in range(self.length): if self.data[i].idx == start_idx: @@ -448,6 +449,7 @@ cdef class Doc: return None def token_index_end(self, int end_idx): + """ Get index+1 of token in doc ending with character index end_idx """ cdef int i for i in range(self.length): if (self.data[i].idx + self.data[i].lex.length) == end_idx: @@ -455,6 +457,8 @@ cdef class Doc: return None def range_from_indices(self, int start_idx, int end_idx): + """ Get tuple - span of token indices which correspond to + character indices (start_idx, end_idx) if such a span exists""" assert start_idx < end_idx cdef int i cdef int start = -1 diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f4dcb15f0..afd809ecf 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -14,14 +14,15 @@ from ..util import normalize_slice cdef class Span: - """A slice from a Doc object.""" + """A slice from a Doc object. Internally keeps character offsets in order + to keep track of changes (merges) in the original Doc. Updates are + made in start and end property.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): if not (0 <= start <= end <= len(tokens)): raise IndexError self.doc = tokens - # keep char offsets - as these don't change when merging spans self.start_token = start self.start_idx = self.doc[start].idx self.end_token = end @@ -80,9 +81,14 @@ cdef class Span: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property start: + """ Get start token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything below check is false - so we get start token + # if we have merged spans in Doc start might have changed. + # check if token start index is in doc index range and the token + # index is start_idx (it hasn't changed). + # Potential IndexError if only second condition was used if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx: + # go through tokens in Doc - find index of token equal to start_idx new_start = self.doc.token_index_start(self.start_idx) if new_start is not None: self.start_token = new_start @@ -92,9 +98,14 @@ cdef class Span: return self.start_token property end: + """ Get end token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything we have fast access + # if we have merged spans in Doc end will have changed. + # check if token end index is in doc index range and the token + # index is end_idx (it hasn't changed). 
+ # Potential IndexError if only second condition was used if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx: + # go through tokens in Doc - find index of token equal to end_idx new_end = self.doc.token_index_end(self.end_idx) if new_end is not None: self.end_token = new_end From 93918b5c234f2111fae0466c4e1498f373c87ab7 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 19:49:22 +0200 Subject: [PATCH 6/8] assign lex after spans, add tests --- setup.py | 230 ++++++++++++++++++++++++++++++++ spacy/tests/spans/test_merge.py | 27 +++- spacy/tokens/doc.pyx | 7 +- 3 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..1d719c626 --- /dev/null +++ b/setup.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +from setuptools import setup +import shutil + +import sys +import os +from os import path + +from setuptools import Extension +from distutils import sysconfig +from distutils.core import setup, Extension +from distutils.command.build_ext import build_ext + +import platform + +# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options +# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used +compile_options = {'msvc' : ['/Ox', '/EHsc'] , + 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] } +link_options = {'msvc' : [] , + 'other' : [] } +class build_ext_options: + def build_options(self): + c_type = None + if self.compiler.compiler_type in compile_options: + c_type = self.compiler.compiler_type + elif 'other' in compile_options: + c_type = 'other' + if c_type is not None: + for e in self.extensions: + e.extra_compile_args = compile_options[c_type] + + l_type = None + if self.compiler.compiler_type in link_options: + l_type = self.compiler.compiler_type + elif 'other' in link_options: + l_type = 'other' + if l_type is not None: + for e in self.extensions: + e.extra_link_args = link_options[l_type] + +class build_ext_subclass( build_ext, build_ext_options ): + def build_extensions(self): + build_ext_options.build_options(self) + build_ext.build_extensions(self) + + + +# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But, +# this is necessary to get it compile. +# We have to resort to monkey-patching to set the compiler, because pypy broke +# all the everything. 
+ +pre_patch_customize_compiler = sysconfig.customize_compiler +def my_customize_compiler(compiler): + pre_patch_customize_compiler(compiler) + compiler.compiler_cxx = ['c++'] + + +if platform.python_implementation() == 'PyPy': + sysconfig.customize_compiler = my_customize_compiler + +#def install_headers(): +# dest_dir = path.join(sys.prefix, 'include', 'murmurhash') +# if not path.exists(dest_dir): +# shutil.copytree('murmurhash/headers/murmurhash', dest_dir) +# +# dest_dir = path.join(sys.prefix, 'include', 'numpy') + + +includes = ['.', path.join(sys.prefix, 'include')] + + +try: + import numpy + numpy_headers = path.join(numpy.get_include(), 'numpy') + shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy')) +except ImportError: + pass +except OSError: + pass + + +def clean(mod_names): + for name in mod_names: + name = name.replace('.', '/') + so = name + '.so' + html = name + '.html' + cpp = name + '.cpp' + c = name + '.c' + for file_path in [so, html, cpp, c]: + if os.path.exists(file_path): + os.unlink(file_path) + + +def name_to_path(mod_name, ext): + return '%s.%s' % (mod_name.replace('.', '/'), ext) + + +def c_ext(mod_name, language, includes): + mod_path = name_to_path(mod_name, language) + return Extension(mod_name, [mod_path], include_dirs=includes) + + +def cython_setup(mod_names, language, includes): + import Cython.Distutils + import Cython.Build + import distutils.core + + class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ): + def build_extensions(self): + build_ext_options.build_options(self) + Cython.Distutils.build_ext.build_extensions(self) + + if language == 'cpp': + language = 'c++' + exts = [] + for mod_name in mod_names: + mod_path = mod_name.replace('.', '/') + '.pyx' + e = Extension(mod_name, [mod_path], language=language, include_dirs=includes) + exts.append(e) + distutils.core.setup( + name='spacy', + packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', + 'spacy.syntax', 'spacy.munge'], + description="Industrial-strength NLP", + author='Matthew Honnibal', + author_email='honnibal@gmail.com', + version=VERSION, + url="http://honnibal.github.io/spaCy/", + package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"], + "spacy.tokens": ["*.pxd"], + "spacy.serialize": ["*.pxd"], + "spacy.en": ["*.pxd", "data/pos/*", + "data/wordnet/*", "data/tokenizer/*", + "data/vocab/tag_map.json", + "data/vocab/lexemes.bin", + "data/vocab/strings.json"], + "spacy.syntax": ["*.pxd"]}, + ext_modules=exts, + cmdclass={'build_ext': build_ext_cython_subclass}, + license="MIT", + ) + + +def run_setup(exts): + setup( + name='spacy', + packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', + 'spacy.syntax', 'spacy.munge', + 'spacy.tests', + 'spacy.tests.matcher', + 'spacy.tests.morphology', + 'spacy.tests.munge', + 'spacy.tests.parser', + 'spacy.tests.serialize', + 'spacy.tests.spans', + 'spacy.tests.tagger', + 'spacy.tests.tokenizer', + 'spacy.tests.tokens', + 'spacy.tests.vectors', + 'spacy.tests.vocab'], + description="Industrial-strength NLP", + author='Matthew Honnibal', + author_email='honnibal@gmail.com', + version=VERSION, + url="http://honnibal.github.io/spaCy/", + package_data={"spacy": ["*.pxd"], + "spacy.en": ["*.pxd", "data/pos/*", + "data/wordnet/*", "data/tokenizer/*", + "data/vocab/lexemes.bin", + "data/vocab/serializer.json", + "data/vocab/oov_prob", + "data/vocab/strings.txt"], + "spacy.syntax": ["*.pxd"]}, + ext_modules=exts, + license="MIT", + install_requires=['numpy', 'murmurhash', 'cymem >= 
1.30', 'preshed >= 0.43', + 'thinc >= 3.4.1', "text_unidecode", 'plac', 'six', + 'ujson', 'cloudpickle'], + setup_requires=["headers_workaround"], + cmdclass = {'build_ext': build_ext_subclass }, + ) + + import headers_workaround + + headers_workaround.fix_venv_pypy_include() + headers_workaround.install_headers('murmurhash') + headers_workaround.install_headers('numpy') + + +VERSION = '0.97' +def main(modules, is_pypy): + language = "cpp" + includes = ['.', path.join(sys.prefix, 'include')] + if sys.platform.startswith('darwin'): + compile_options['other'].append('-mmacosx-version-min=10.8') + compile_options['other'].append('-stdlib=libc++') + link_options['other'].append('-lc++') + if use_cython: + cython_setup(modules, language, includes) + else: + exts = [c_ext(mn, language, includes) + for mn in modules] + run_setup(exts) + +MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', + 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', + 'spacy.morphology', 'spacy.tagger', + 'spacy.syntax.stateclass', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', + 'spacy.syntax.parser', + 'spacy.syntax.transition_system', + 'spacy.syntax.arc_eager', + 'spacy.syntax._parse_features', + 'spacy.gold', 'spacy.orth', + 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', + 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', + 'spacy.cfile', 'spacy.matcher', + 'spacy.syntax.ner', + 'spacy.symbols'] + + +if __name__ == '__main__': + if sys.argv[1] == 'clean': + clean(MOD_NAMES) + else: + use_cython = sys.argv[1] == 'build_ext' + main(MOD_NAMES, use_cython) diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py index 2360a0839..315757a0b 100644 --- a/spacy/tests/spans/test_merge.py +++ b/spacy/tests/spans/test_merge.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models def test_merge_tokens(EN): tokens = EN(u'Los Angeles start.') assert len(tokens) == 4 @@ -13,7 +12,6 @@ def test_merge_tokens(EN): assert tokens[0].head.orth_ == 'start' -@pytest.mark.models def test_merge_heads(EN): tokens = EN(u'I found a pilates class near work.') assert len(tokens) == 8 @@ -32,7 +30,6 @@ def test_issue_54(EN): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' tokens = EN(text) -@pytest.mark.models def test_np_merges(EN): text = u'displaCy is a parse tool built with Javascript' tokens = EN(text) @@ -47,3 +44,27 @@ def test_np_merges(EN): merged = tokens.merge(start, end, label, lemma, label) assert merged != None, (start, end, label, lemma) +def test_entity_merge(EN): + tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale') + assert(len(tokens) == 15) + for ent in tokens.ents: + label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent)) + ent.merge(label, lemma, type_) + # check looping is ok + assert(len(tokens) == 13) + +def test_sentence_update_after_merge(EN): + tokens = EN(u'Stewart Lee is a stand up comedian. 
He lives in England and loves Joe Pasquale') + sent1, sent2 = list(tokens.sents) + init_len = len(sent1) + merge_me = tokens[0:2] + merge_me.merge(u'none', u'none', u'none') + assert(len(sent1) == init_len - 1) + +def test_subtree_size_check(EN): + tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale') + sent1 = list(tokens.sents)[0] + init_len = len(list(sent1.root.subtree)) + merge_me = tokens[0:2] + merge_me.merge(u'none', u'none', u'none') + assert(len(list(sent1.root.subtree)) == init_len - 1) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d11054e35..555528a33 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -459,7 +459,6 @@ cdef class Doc: def range_from_indices(self, int start_idx, int end_idx): """ Get tuple - span of token indices which correspond to character indices (start_idx, end_idx) if such a span exists""" - assert start_idx < end_idx cdef int i cdef int start = -1 cdef int end = -1 @@ -490,8 +489,6 @@ cdef class Doc: cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] - # Update fields - token.lex = lex token.spacy = self.data[end-1].spacy # What to do about morphology?? # TODO: token.morph = ??? @@ -509,6 +506,10 @@ cdef class Doc: # bridges over the entity. Here the alignment of the tokens changes. span_root = span.root.i token.dep = span.root.dep + # We update token.lex after keeping span root and dep, since + # setting token.lex will change span.start and span.end properties + # as it modifies the character offsets in the doc + token.lex = lex for i in range(self.length): self.data[i].head += i # Set the head of the merged token, and its dep relation, from the Span From 770e3637ff5308fef403279d06959cf30b7fb435 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 20:20:42 +0200 Subject: [PATCH 7/8] update data -> c :) --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c7974bf31..5549d78d4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -443,7 +443,7 @@ cdef class Doc: """ Get index of token in doc that has character index start_idx """ cdef int i for i in range(self.length): - if self.data[i].idx == start_idx: + if self.c[i].idx == start_idx: return i return None @@ -451,7 +451,7 @@ cdef class Doc: """ Get index+1 of token in doc ending with character index end_idx """ cdef int i for i in range(self.length): - if (self.data[i].idx + self.data[i].lex.length) == end_idx: + if (self.c[i].idx + self.c[i].lex.length) == end_idx: return i + 1 return None From 9fa35951ab7b4e6bbc8f1be40ae30517d4c1bc64 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 20:21:25 +0200 Subject: [PATCH 8/8] install error test/span -> test/spans --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d153ae3da..e8715eaef 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ def run_setup(exts): 'spacy.tests.munge', 'spacy.tests.parser', 'spacy.tests.serialize', - 'spacy.tests.span', + 'spacy.tests.spans', 'spacy.tests.tagger', 'spacy.tests.tokenizer', 'spacy.tests.tokens',
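
Taken together, the patches above do two things: they make `Doc`, `Span` and `Token` printable on both Python 2 and Python 3 by routing `__str__`/`__repr__` through `__unicode__`/`__bytes__` with a `six.PY3` check, and they keep a `Span` valid after `Doc.merge()` by anchoring it to character offsets (`start_idx`/`end_idx`) and re-resolving token indices lazily. Below is a rough, standalone sketch of those two patterns. It is not spaCy code: `MiniToken`, `MiniDoc`, `MiniSpan`, the single-space tokenization and the simplified `merge()` are illustrative assumptions only.

```python
# -*- coding: utf-8 -*-
# Standalone sketch (NOT spaCy's API) of the two patterns in this patch series:
#   1) Py2/Py3-safe __str__/__repr__ via __unicode__/__bytes__ and six.PY3,
#   2) spans that survive merges by storing character offsets and
#      re-resolving token indices on access.
# MiniToken/MiniDoc/MiniSpan and the simplified merge are hypothetical.
from __future__ import print_function, unicode_literals
import six


class MiniToken(object):
    def __init__(self, text, idx):
        self.text = text   # token string
        self.idx = idx     # character offset of the token in the doc

    def __unicode__(self):
        return self.text

    def __bytes__(self):
        return self.text.encode('utf-8')

    def __str__(self):
        # Python 3 wants unicode from __str__, Python 2 wants bytes.
        return self.__unicode__() if six.PY3 else self.__bytes__()

    __repr__ = __str__


class MiniDoc(object):
    def __init__(self, words):
        # Assume tokens separated by single spaces when assigning offsets.
        self.tokens, offset = [], 0
        for w in words:
            self.tokens.append(MiniToken(w, offset))
            offset += len(w) + 1

    def token_index_start(self, start_idx):
        # Index of the token whose character offset equals start_idx, else None.
        for i, t in enumerate(self.tokens):
            if t.idx == start_idx:
                return i
        return None

    def token_index_end(self, end_idx):
        # Index + 1 of the token that ends at character offset end_idx, else None.
        for i, t in enumerate(self.tokens):
            if t.idx + len(t.text) == end_idx:
                return i + 1
        return None

    def merge(self, start, end):
        # Collapse tokens[start:end] into one token; a crude stand-in for
        # Doc.merge().  Character offsets of later tokens are unchanged.
        merged = MiniToken(' '.join(t.text for t in self.tokens[start:end]),
                           self.tokens[start].idx)
        self.tokens[start:end] = [merged]


class MiniSpan(object):
    def __init__(self, doc, start, end):
        self.doc = doc
        # Store character offsets: merges change token indices, not offsets.
        self.start_idx = doc.tokens[start].idx
        self.end_idx = doc.tokens[end - 1].idx + len(doc.tokens[end - 1].text)

    @property
    def start(self):
        return self.doc.token_index_start(self.start_idx)

    @property
    def end(self):
        return self.doc.token_index_end(self.end_idx)


doc = MiniDoc(['I', 'sat', 'at', 'the', 'caf\xe9'])
span = MiniSpan(doc, 3, 5)        # "the café"
doc.merge(0, 2)                   # merge "I sat" into a single token
print(span.start, span.end)       # token indices re-resolved: 2 4
print(doc.tokens[span.end - 1])   # "café"; no UnicodeEncodeError on a Py2 console
```

The design choice this illustrates is that character offsets are the one coordinate system a merge leaves untouched, which is why the real `Span` in these patches keeps `start_idx`/`end_idx` and recomputes `start`/`end` in properties instead of caching token indices.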