mirror of https://github.com/explosion/spaCy.git
commit c780bbda3e
parent 7691737fad

    changed start end to properties - allow merge
 setup.py | 230 ------------------------------------------------------------
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from setuptools import setup
-import shutil
-
-import sys
-import os
-from os import path
-
-from setuptools import Extension
-from distutils import sysconfig
-from distutils.core import setup, Extension
-from distutils.command.build_ext import build_ext
-
-import platform
-
-# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
-# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
-compile_options = {'msvc'  : ['/Ox', '/EHsc'],
-                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
-link_options    = {'msvc'  : [],
-                   'other' : []}
-class build_ext_options:
-    def build_options(self):
-        c_type = None
-        if self.compiler.compiler_type in compile_options:
-            c_type = self.compiler.compiler_type
-        elif 'other' in compile_options:
-            c_type = 'other'
-        if c_type is not None:
-            for e in self.extensions:
-                e.extra_compile_args = compile_options[c_type]
-
-        l_type = None
-        if self.compiler.compiler_type in link_options:
-            l_type = self.compiler.compiler_type
-        elif 'other' in link_options:
-            l_type = 'other'
-        if l_type is not None:
-            for e in self.extensions:
-                e.extra_link_args = link_options[l_type]
-
-class build_ext_subclass(build_ext, build_ext_options):
-    def build_extensions(self):
-        build_ext_options.build_options(self)
-        build_ext.build_extensions(self)
-
-
-# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
-# this is necessary to get it compile.
-# We have to resort to monkey-patching to set the compiler, because pypy broke
-# all the everything.
-
-pre_patch_customize_compiler = sysconfig.customize_compiler
-def my_customize_compiler(compiler):
-    pre_patch_customize_compiler(compiler)
-    compiler.compiler_cxx = ['c++']
-
-
-if platform.python_implementation() == 'PyPy':
-    sysconfig.customize_compiler = my_customize_compiler
-
-#def install_headers():
-#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
-#    if not path.exists(dest_dir):
-#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
-#
-#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
-
-
-includes = ['.', path.join(sys.prefix, 'include')]
-
-
-try:
-    import numpy
-    numpy_headers = path.join(numpy.get_include(), 'numpy')
-    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
-except ImportError:
-    pass
-except OSError:
-    pass
-
-
-def clean(mod_names):
-    for name in mod_names:
-        name = name.replace('.', '/')
-        so = name + '.so'
-        html = name + '.html'
-        cpp = name + '.cpp'
-        c = name + '.c'
-        for file_path in [so, html, cpp, c]:
-            if os.path.exists(file_path):
-                os.unlink(file_path)
-
-
-def name_to_path(mod_name, ext):
-    return '%s.%s' % (mod_name.replace('.', '/'), ext)
-
-
-def c_ext(mod_name, language, includes):
-    mod_path = name_to_path(mod_name, language)
-    return Extension(mod_name, [mod_path], include_dirs=includes)
-
-
-def cython_setup(mod_names, language, includes):
-    import Cython.Distutils
-    import Cython.Build
-    import distutils.core
-
-    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
-        def build_extensions(self):
-            build_ext_options.build_options(self)
-            Cython.Distutils.build_ext.build_extensions(self)
-
-    if language == 'cpp':
-        language = 'c++'
-    exts = []
-    for mod_name in mod_names:
-        mod_path = mod_name.replace('.', '/') + '.pyx'
-        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
-        exts.append(e)
-    distutils.core.setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://honnibal.github.io/spaCy/",
-        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
-                      "spacy.tokens": ["*.pxd"],
-                      "spacy.serialize": ["*.pxd"],
-                      "spacy.en": ["*.pxd", "data/pos/*",
-                                   "data/wordnet/*", "data/tokenizer/*",
-                                   "data/vocab/tag_map.json",
-                                   "data/vocab/lexemes.bin",
-                                   "data/vocab/strings.json"],
-                      "spacy.syntax": ["*.pxd"]},
-        ext_modules=exts,
-        cmdclass={'build_ext': build_ext_cython_subclass},
-        license="MIT",
-    )
-
-
-def run_setup(exts):
-    setup(
-        name='spacy',
-        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
-                  'spacy.syntax', 'spacy.munge',
-                  'spacy.tests',
-                  'spacy.tests.matcher',
-                  'spacy.tests.morphology',
-                  'spacy.tests.munge',
-                  'spacy.tests.parser',
-                  'spacy.tests.serialize',
-                  'spacy.tests.spans',
-                  'spacy.tests.tagger',
-                  'spacy.tests.tokenizer',
-                  'spacy.tests.tokens',
-                  'spacy.tests.vectors',
-                  'spacy.tests.vocab'],
-        description="Industrial-strength NLP",
-        author='Matthew Honnibal',
-        author_email='honnibal@gmail.com',
-        version=VERSION,
-        url="http://honnibal.github.io/spaCy/",
-        package_data={"spacy": ["*.pxd"],
-                      "spacy.en": ["*.pxd", "data/pos/*",
-                                   "data/wordnet/*", "data/tokenizer/*",
-                                   "data/vocab/lexemes.bin",
-                                   "data/vocab/serializer.json",
-                                   "data/vocab/oov_prob",
-                                   "data/vocab/strings.txt"],
-                      "spacy.syntax": ["*.pxd"]},
-        ext_modules=exts,
-        license="MIT",
-        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
-                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle'],
-        setup_requires=["headers_workaround"],
-        cmdclass={'build_ext': build_ext_subclass},
-    )
-
-    import headers_workaround
-
-    headers_workaround.fix_venv_pypy_include()
-    headers_workaround.install_headers('murmurhash')
-    headers_workaround.install_headers('numpy')
-
-
-VERSION = '0.97'
-def main(modules, is_pypy):
-    language = "cpp"
-    includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.platform.startswith('darwin'):
-        compile_options['other'].append('-mmacosx-version-min=10.8')
-        compile_options['other'].append('-stdlib=libc++')
-        link_options['other'].append('-lc++')
-    if use_cython:
-        cython_setup(modules, language, includes)
-    else:
-        exts = [c_ext(mn, language, includes)
-                for mn in modules]
-        run_setup(exts)
-
-
-MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
-             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
-             'spacy.morphology', 'spacy.tagger',
-             'spacy.syntax.stateclass',
-             'spacy._ml', 'spacy._theano',
-             'spacy.tokenizer',
-             'spacy.syntax.parser',
-             'spacy.syntax.transition_system',
-             'spacy.syntax.arc_eager',
-             'spacy.syntax._parse_features',
-             'spacy.gold', 'spacy.orth',
-             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
-             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
-             'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner',
-             'spacy.symbols']
-
-
-if __name__ == '__main__':
-    if sys.argv[1] == 'clean':
-        clean(MOD_NAMES)
-    else:
-        use_cython = sys.argv[1] == 'build_ext'
-        main(MOD_NAMES, use_cython)
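The heart of the deleted setup.py is the pattern its own comment cites: the concrete compiler is only known after finalize_options has run, so per-compiler flags have to be applied inside build_extensions. Below is a minimal standalone sketch of that pattern under modern setuptools; the extension name example.mod and its source path are placeholders, not anything from this repo.

from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext

# Same flag tables as the deleted file: MSVC spellings vs. everyone else's.
COMPILE_OPTIONS = {
    'msvc':  ['/Ox', '/EHsc'],
    'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
}

class BuildExtWithOptions(build_ext):
    def build_extensions(self):
        # self.compiler is populated by this point; it does not exist yet
        # when the command object is constructed.
        key = self.compiler.compiler_type
        flags = COMPILE_OPTIONS.get(key, COMPILE_OPTIONS['other'])
        for ext in self.extensions:
            ext.extra_compile_args = flags
        build_ext.build_extensions(self)

setup(
    name='example',
    ext_modules=[Extension('example.mod', ['example/mod.cpp'], language='c++')],
    cmdclass={'build_ext': BuildExtWithOptions},
)

Running python setup.py build_ext then picks /Ox under MSVC and -O3 under gcc/clang, which is exactly what build_ext_options.build_options did above.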
@@ -439,11 +439,23 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
 
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
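The two hunks above (evidently spacy/tokens/doc.pyx, given the cdef class Doc context) pull the offset-to-token search out of merge() into token_index_start, token_index_end and range_from_indices, all keyed on character offsets. Here is a pure-Python model of those lookups, with (idx, length) pairs standing in for the TokenC array; the sample tokenization is an illustrative assumption.

# token_index_start/end map character offsets back to token indices;
# range_from_indices combines them the way the rewritten merge() does.
def token_index_start(tokens, start_idx):
    for i, (idx, _length) in enumerate(tokens):
        if idx == start_idx:
            return i
    return None

def token_index_end(tokens, end_idx):
    for i, (idx, length) in enumerate(tokens):
        if idx + length == end_idx:
            return i + 1
    return None

def range_from_indices(tokens, start_idx, end_idx):
    assert start_idx < end_idx
    start = token_index_start(tokens, start_idx)
    end = token_index_end(tokens, end_idx)
    if start is None or end is None:
        return None
    return (start, end)

# "New York City": New=(0,3), York=(4,4), City=(9,4)
tokens = [(0, 3), (4, 4), (9, 4)]
assert token_index_start(tokens, 4) == 1    # "York" starts at char 4
assert token_index_end(tokens, 13) == 3     # "City" ends just before char 13
assert range_from_indices(tokens, 4, 13) == (1, 3)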
@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector
@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - as these don't change when merging spans
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
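In the Span constructor (spacy/tokens/spans.pyx, to judge by the hunk context), the change caches character offsets alongside the token indices: a merge renumbers tokens, but the characters they cover stay put. A worked example of the two cached offsets, using the same (idx, length) stand-ins as above:

# Span over tokens [1, 3) of "New York City"
tokens = [(0, 3), (4, 4), (9, 4)]
start, end = 1, 3
start_idx = tokens[start][0]                        # 4: "York" begins at char 4
end_idx = tokens[end - 1][0] + tokens[end - 1][1]   # 13: one past the end of "City"
assert (start_idx, end_idx) == (4, 13)
# After any merge, chars 4..13 still delimit exactly the same text,
# even though the token numbered 1 may no longer be "York".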
@@ -76,6 +79,30 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if we haven't merged anything, the check below is false - so we get the start token
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if we haven't merged anything we have fast access
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
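Taken together, the new properties give spans lazy self-repair: the cached token index is trusted as long as it still lines up with the cached character offset, and is re-derived from the Doc otherwise. A pure-Python sketch of the start property's logic (the merge is simulated by editing the token list directly; the real code goes through Doc.merge):

class SpanModel:
    """Stand-in for Span; tokens are (idx, length) pairs."""
    def __init__(self, tokens, start, end):
        self.tokens = tokens
        self.start_token = start
        self.end_token = end
        # character offsets survive merges; token indices may not
        self.start_idx = tokens[start][0]
        self.end_idx = tokens[end - 1][0] + tokens[end - 1][1]

    @property
    def start(self):
        # fast path: cached token index still matches the cached char offset
        if (self.start_token < len(self.tokens)
                and self.tokens[self.start_token][0] == self.start_idx):
            return self.start_token
        # slow path: a merge shifted tokens; re-derive the index from the offset
        for i, (idx, _length) in enumerate(self.tokens):
            if idx == self.start_idx:
                self.start_token = i
                return i
        raise IndexError('No token found with idx %s' % self.start_idx)

# Span over "City" in "New York City", then merge "New York" by hand:
span = SpanModel([(0, 3), (4, 4), (9, 4)], 2, 3)
span.tokens = [(0, 8), (9, 4)]
assert span.start == 1   # stale token index 2 re-resolved via char offset 9

One subtlety visible in the diff: the end property's staleness check compares the last token's idx (not idx plus its length) against end_idx, so as written it appears the fast path for end can essentially never hit and the property always re-derives via token_index_end.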