mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 01:34:30 +03:00)

Commit 150cf6dd3b: Merge string views feature branch

docs/Makefile (177 lines, deleted)
@@ -1,177 +0,0 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spacy.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spacy.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/spacy"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spacy"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
docs/conf.py (269 lines, deleted)
@@ -1,269 +0,0 @@
# -*- coding: utf-8 -*-
#
# spacy documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 3 21:54:08 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

import sys
import os

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'spacy'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    'sphinx.ext.coverage',
    'sphinx.ext.viewcode',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = u'spacy'
copyright = u'2014, Matthew Honnibal'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.0'
# The full version, including alpha/beta/rc tags.
release = '0.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Output file base name for HTML help builder.
htmlhelp_basename = 'spacydoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    ('index', 'spacy.tex', u'spacy Documentation',
     u'Matthew Honnibal', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    ('index', 'spacy', u'spacy Documentation',
     [u'Matthew Honnibal'], 1)
]

# If true, show URL addresses after external links.
#man_show_urls = False


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    ('index', 'spacy', u'spacy Documentation',
     u'Matthew Honnibal', 'spacy', 'One line description of project.',
     'Miscellaneous'),
]

# Documents to append as an appendix to all manuals.
#texinfo_appendices = []

# If false, no module index is generated.
#texinfo_domain_indices = True

# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'

# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False


# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}
@@ -1,26 +0,0 @@
.. spacy documentation master file, created by
   sphinx-quickstart on Thu Jul 3 21:54:08 2014.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to spacy's documentation!
=================================

Contents:

.. toctree::
   :maxdepth: 2

.. py:function:: enumerate(sequence[, start=0])

   Return an iterator that yields tuples of an index and an item of the
   *sequence*. (And so on.)


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -1,8 +1,17 @@
 from .lexeme import lex_of
 from .lexeme import sic_of

+from .tokens import Tokens

-__all__ = [lex_of, sic_of]
+# Don't know how to get the enum Python visible :(
+
+SIC = 0
+LEX = 1
+NORM = 2
+SHAPE = 3
+LAST3 = 4
+
+__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]

 """
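As the new comment notes, a cdef enum defined in Cython is not directly importable from Python, so the package mirrors it with plain integers. An illustrative sketch of the pattern (not part of the commit; the values are assumed to follow the StringAttr declaration order):

    # Module-level ints standing in for the Cython StringAttr enum
    SIC, LEX, NORM, SHAPE, LAST3 = range(5)

    # Python callers pass these ints through the API added in this branch,
    # e.g. tokens.count_by(LEX) or tokens.group_by(LAST3); the Cython layer
    # interprets them as StringAttr values.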
@@ -1,8 +1,8 @@
 from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr

 from spacy.spacy cimport Language
 from spacy.tokens cimport Tokens
@@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr

 from . import util

@@ -37,12 +36,15 @@ cdef class English(spacy.Language):
 cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
+        # ...Unless we're at 0
+        return i == 0
+    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
         return False
     # Don't count commas as punct if the next char is a number
     if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
         return False
-    # Don't count periods as punct if the next char is a number
+    # Don't count periods as punct if the next char is not whitespace
-    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
+    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
         return False
     return not word[i].isalnum()
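Restated in plain Python, the updated punctuation rules above read as follows. This is an illustrative transcription for readers, not the shipped Cython function, and the sample strings are assumptions:

    def is_punct(word, i, length):
        # An apostrophe before a letter is punctuation only at position 0 ("'The")
        if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
            return i == 0
        # A "--" sequence is not treated as punctuation
        if word[i] == "-" and i < (length - 1) and word[i + 1] == '-':
            return False
        # A comma followed by a digit stays inside the token (e.g. "10,000")
        if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
            return False
        # A period followed by any non-whitespace now stays inside the token
        if word[i] == "." and i < (length - 1) and not word[i + 1].isspace():
            return False
        return not word[i].isalnum()

    assert is_punct("'The", 0, 4)        # leading apostrophe is split off
    assert not is_punct("don't", 3, 5)   # clitic apostrophe is kept
    assert not is_punct("U.K.", 1, 4)    # internal period is kept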
@@ -2,8 +2,8 @@ from libcpp.vector cimport vector

 from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme
-from spacy.lexeme cimport Lexeme_addr
+from spacy.spacy cimport Lexeme_addr
 from spacy.tokens cimport Tokens
@@ -4,11 +4,11 @@ boldly assume no collisions.
 '''
 from __future__ import unicode_literals


 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

-from spacy.lexeme cimport Lexeme
 from spacy.string_tools cimport substr
 from spacy.spacy cimport Language
 from . import util
@@ -4,39 +4,53 @@ from libc.stdint cimport uint64_t
 ctypedef int ClusterID
 ctypedef uint64_t StringHash
 ctypedef size_t Lexeme_addr
+ctypedef char Bits8
+ctypedef uint64_t Bits64

 from spacy.spacy cimport Language

+
+cdef struct Orthography:
+    StringHash last3
+    StringHash shape
+    StringHash norm
+
+    Py_UNICODE first
+    Bits8 flags
+
+
+cdef struct Distribution:
+    double prob
+    ClusterID cluster
+    Bits64 tagdict
+    Bits8 flags
+
+
 cdef struct Lexeme:
     StringHash sic       # Hash of the original string
     StringHash lex       # Hash of the word, with punctuation and clitics split off
-    StringHash normed    # Hash of the normalized version of lex
-    StringHash last3     # Last 3 characters of the token
-    Py_UNICODE first     # First character of the token

-    double prob          # What is the log probability of the lex value?
+    Distribution* dist   # Distribution info, lazy loaded
-    ClusterID cluster    # Brown cluster of the token
+    Orthography* orth    # Extra orthographic views

-    bint oft_upper       # Is the lowered version of the lex value often in all caps?
-    bint oft_title       # Is the lowered version of the lex value often title-cased?
     Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens


-cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
+cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length)

-# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
-# has a conditional to pick out the correct item. This allows safe iteration
-# over the Lexeme, via:
-# for field in range(LexAttr.n): get_attr(Lexeme*, field)
-cdef enum HashFields:
-    sic
-    lex
-    normed
-    cluster
-    n

-#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
+cdef enum StringAttr:
+    SIC
+    LEX
+    NORM
+    SHAPE
+    LAST3

+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

+cpdef StringHash sic_of(size_t lex_id) except 0
+cpdef StringHash lex_of(size_t lex_id) except 0
+cpdef StringHash norm_of(size_t lex_id) except 0
+cpdef StringHash shape_of(size_t lex_id) except 0
+cpdef StringHash last3_of(size_t lex_id) except 0
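The hunk above splits the previously flat Lexeme struct into a small core plus two lazily loaded records. A rough Python picture of the new layout, with dataclasses standing in for C structs and None for NULL; field meanings are taken from the comments in the diff:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Orthography:          # extra orthographic views, built on demand
        last3: int = 0          # hash of the last three characters
        shape: int = 0          # hash of the word shape, e.g. "XdXd"
        norm: int = 0           # hash of the normalized form
        first: str = ""         # first character of the token
        flags: int = 0

    @dataclass
    class Distribution:         # distribution info, lazy loaded
        prob: float = 0.0       # log probability
        cluster: int = 0        # Brown cluster ID
        tagdict: int = 0
        flags: int = 0

    @dataclass
    class Lexeme:
        sic: int = 0                          # hash of the original string
        lex: int = 0                          # hash with punctuation/clitics split off
        dist: Optional[Distribution] = None   # lazily attached statistics
        orth: Optional[Orthography] = None    # lazily attached string views
        tail: Optional["Lexeme"] = None       # linked list of sub-tokens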
spacy/lexeme.pyx (114 lines)

@@ -11,49 +11,29 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector

+from spacy.spacy cimport StringHash

-cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
-                         int split, size_t length):
-    assert split <= length
-    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-    word.first = <Py_UNICODE>(string[0] if string else 0)
-    word.sic = hashed
-
-    cdef unicode tail_string
-    cdef unicode lex
-    if split != 0 and split < length:
-        lex = substr(string, 0, split, length)
-        tail_string = substr(string, split, length, length)
-    else:
-        lex = string
-        tail_string = ''
-    assert lex
-    #cdef unicode normed = normalize_word_string(lex)
-    cdef unicode normed = '?'
-    cdef unicode last3 = substr(string, length - 3, length, length)
-
-    assert normed
-    assert len(normed)
-
-    word.lex = lang.hash_string(lex, len(lex))
-    word.normed = lang.hash_string(normed, len(normed))
-    word.last3 = lang.hash_string(last3, len(last3))
-
-    lang.bacov[word.lex] = lex
-    lang.bacov[word.normed] = normed
-    lang.bacov[word.last3] = last3
-
-    # These are loaded later
-    word.prob = 0
-    word.cluster = 0
-    word.oft_upper = False
-    word.oft_title = False
-
-    # Now recurse, and deal with the tail
-    if tail_string:
-        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
-    return word
+# Reiterate the enum, for python
+#SIC = StringAttr.sic
+#LEX = StringAttr.lex
+#NORM = StringAttr.norm
+#SHAPE = StringAttr.shape
+#LAST3 = StringAttr.last3


+cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
+    if attr == SIC:
+        return sic_of(lex_id)
+    elif attr == LEX:
+        return lex_of(lex_id)
+    elif attr == NORM:
+        return norm_of(lex_id)
+    elif attr == SHAPE:
+        return shape_of(lex_id)
+    elif attr == LAST3:
+        return last3_of(lex_id)
+    else:
+        raise StandardError


 cpdef StringHash sic_of(size_t lex_id) except 0:

@@ -82,6 +62,35 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     return (<Lexeme*>lex_id).lex


+cpdef StringHash norm_of(size_t lex_id) except 0:
+    '''Access the `lex' field of the Lexeme pointed to by lex_id.
+
+    The lex field is the hash of the string you would expect to get back from
+    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
+    delimited tokens split off. The other fields refer to properties of the
+    string that the lex field stores a hash of, except sic and tail.
+
+    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    [u'Hi', u'!', u'world']
+    '''
+    return (<Lexeme*>lex_id).orth.norm
+
+
+cpdef StringHash shape_of(size_t lex_id) except 0:
+    return (<Lexeme*>lex_id).orth.shape
+
+
+cpdef StringHash last3_of(size_t lex_id) except 0:
+    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
+    the hash of the last three characters of the word:
+
+    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
+    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
+    [u'llo', u'!']
+    '''
+    return (<Lexeme*>lex_id).orth.last3
+
+
 cpdef ClusterID cluster_of(size_t lex_id):
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word,

@@ -98,7 +107,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).cluster
+    return (<Lexeme*>lex_id).dist.cluster


 cpdef Py_UNICODE first_of(size_t lex_id):

@@ -109,7 +118,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).first
+    return (<Lexeme*>lex_id).orth.first


 cpdef double prob_of(size_t lex_id):

@@ -122,18 +131,7 @@ cpdef double prob_of(size_t lex_id):
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    pass
-
-
-cpdef StringHash last3_of(size_t lex_id):
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).last3
+    return (<Lexeme*>lex_id).dist.prob


 cpdef bint is_oft_upper(size_t lex_id):

@@ -148,7 +146,12 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc'))  # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).oft_upper
+    return False
+    #cdef Lexeme* w = <Lexeme*>lex_id
+    #return w.orth.last3 if w.orth != NULL else 0
+
+    #return (<Lexeme*>lex_id).oft_upper


 cpdef bint is_oft_title(size_t lex_id):

@@ -163,4 +166,5 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS'))  # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).oft_title
+    return False
+    #return (<Lexeme*>lex_id).oft_title
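Each accessor above returns the hash of one string view, and attr_of simply dispatches on the StringAttr constant. A hedged usage sketch, assuming the cpdef functions import the same way as the ones exercised in tests/test_orth.py:

    from spacy.en import lookup, unhash
    from spacy.lexeme import attr_of, shape_of, last3_of
    from spacy import SHAPE, LAST3

    lex_id = lookup("Hello")
    print(unhash(shape_of(lex_id)))   # "Xxxx": runs longer than three collapse
    print(unhash(last3_of(lex_id)))   # "llo", per the docstring above

    # attr_of is just a switch over the same accessors
    assert attr_of(lex_id, SHAPE) == shape_of(lex_id)
    assert attr_of(lex_id, LAST3) == last3_of(lex_id)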
@@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
-ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
+ctypedef dense_hash_map[StringHash, size_t] Vocab
-ctypedef int (*Splitter)(unicode word, size_t length)
+from spacy.lexeme cimport Lexeme
+from spacy.tokens cimport Tokens
+
+# Put these above import to avoid circular import problem
+ctypedef char Bits8
+ctypedef uint64_t Bits64
+ctypedef int ClusterID

 from spacy.lexeme cimport Lexeme
-from spacy.tokens cimport Tokens
+from spacy.lexeme cimport Distribution
+from spacy.lexeme cimport Orthography


 cdef class Language:
     cdef object name
     cdef Vocab* vocab
+    cdef Vocab* distri
+    cdef Vocab* ortho
     cdef dict bacov
     cdef int find_split(self, unicode word, size_t length)

@@ -26,3 +37,8 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length)
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
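The Language object now carries three hash tables keyed by string hash, plus a reverse map. A dict-based sketch of that storage (an assumption for illustration; the real code uses dense_hash_map, and 'bacov' is presumably the hash-to-string reverse vocabulary):

    class Language:
        def __init__(self, name):
            self.name = name
            self.vocab = {}    # StringHash -> Lexeme address
            self.ortho = {}    # StringHash -> shared Orthography record
            self.distri = {}   # StringHash -> shared Distribution record
            self.bacov = {}    # StringHash -> original unicode string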
spacy/spacy.pyx (100 lines)

@@ -6,22 +6,65 @@ from libc.stdlib cimport calloc, free
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B

-from spacy.lexeme cimport init_lexeme
+from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

-from spacy.string_tools cimport is_whitespace
+from spacy.string_tools cimport substr

 from . import util
 from os import path
 cimport cython

+
+def get_normalized(unicode lex, size_t length):
+    if lex.isalpha() and lex.islower():
+        return lex
+    else:
+        return get_word_shape(lex, length)
+
+
+def get_word_shape(lex, length):
+    shape = ""
+    last = ""
+    shape_char = ""
+    seq = 0
+    for c in lex:
+        if c.isalpha():
+            if c.isupper():
+                shape_char = "X"
+            else:
+                shape_char = "x"
+        elif c.isdigit():
+            shape_char = "d"
+        else:
+            shape_char = c
+        if shape_char == last:
+            seq += 1
+        else:
+            seq = 0
+            last = shape_char
+        if seq < 3:
+            shape += shape_char
+    assert shape
+    return shape
+
+
+def set_orth_flags(lex, length):
+    return 0
+
+
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
         self.vocab = new Vocab()
+        self.ortho = new Vocab()
+        self.distri = new Vocab()
         self.vocab[0].set_empty_key(0)
+        self.distri[0].set_empty_key(0)
+        self.ortho[0].set_empty_key(0)
         self.load_tokenization(util.read_tokenization(name))

     def load_tokenization(self, token_rules=None):

@@ -80,7 +123,7 @@ cdef class Language:
         return <Lexeme_addr>word_ptr

     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        word = init_lexeme(self, string, hashed, split, length)
+        word = self.init_lexeme(string, hashed, split, length)
         self.vocab[0][hashed] = <Lexeme_addr>word
         self.bacov[hashed] = string
         return word

@@ -121,6 +164,55 @@ cdef class Language:
     cdef int find_split(self, unicode word, size_t length):
         return -1

+    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
+                             int split, size_t length):
+        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+
+        word.sic = hashed
+
+        cdef unicode tail_string
+        cdef unicode lex
+        if split != 0 and split < length:
+            lex = substr(string, 0, split, length)
+            tail_string = substr(string, split, length, length)
+        else:
+            lex = string
+            tail_string = ''
+
+        word.lex = self.hash_string(lex, len(lex))
+        self.bacov[word.lex] = lex
+        word.orth = <Orthography*>self.ortho[0][word.lex]
+        if word.orth == NULL:
+            word.orth = self.init_orth(word.lex, lex)
+        word.dist = <Distribution*>self.distri[0][word.lex]
+
+        # Now recurse, and deal with the tail
+        if tail_string:
+            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
+        return word
+
+    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
+        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
+        orth.first = <Py_UNICODE>lex[0]
+
+        cdef int length = len(lex)
+
+        orth.flags = set_orth_flags(lex, length)
+
+        cdef unicode last3 = substr(lex, length - 3, length, length)
+        cdef unicode norm = get_normalized(lex, length)
+        cdef unicode shape = get_word_shape(lex, length)
+
+        orth.last3 = self.hash_string(last3, len(last3))
+        orth.shape = self.hash_string(shape, len(shape))
+        orth.norm = self.hash_string(norm, len(norm))
+
+        self.bacov[orth.last3] = last3
+        self.bacov[orth.shape] = shape
+        self.bacov[orth.norm] = norm
+
+        self.ortho[0][hashed] = <size_t>orth
+        return orth
+

 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

@@ -137,7 +229,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
 cpdef vector[size_t] expand_chunk(size_t addr) except *:
     cdef vector[size_t] tokens = vector[size_t]()
     word = <Lexeme*>addr
-    while word is not NULL:
+    while word != NULL:
         tokens.push_back(<size_t>word)
         word = word.tail
     return tokens
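get_word_shape in the hunk above is a plain Python def inside the .pyx file, so it can be lifted out as a lightly trimmed copy to see what the SHAPE view contains (length is unused, as in the original; the sample inputs are illustrative):

    def get_word_shape(lex, length):
        shape = ""
        last = ""
        shape_char = ""
        seq = 0
        for c in lex:
            if c.isalpha():
                shape_char = "X" if c.isupper() else "x"
            elif c.isdigit():
                shape_char = "d"
            else:
                shape_char = c
            if shape_char == last:
                seq += 1
            else:
                seq = 0
                last = shape_char
            if seq < 3:   # character runs are truncated after three repeats
                shape += shape_char
        return shape

    print(get_word_shape("C3P0", 4))        # "XdXd", as asserted in tests/test_orth.py
    print(get_word_shape("Pineapple", 9))   # "Xxxx": the long lowercase run collapses
    print(get_word_shape("10,000", 6))      # "dd,ddd"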
@@ -1,5 +1,6 @@
 # cython: profile=True
+

 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
@@ -1,12 +1,9 @@
 from libcpp.vector cimport vector
-from spacy.lexeme cimport Lexeme
+from spacy.spacy cimport Lexeme_addr
-from spacy.lexeme cimport Lexeme_addr

 from cython.operator cimport dereference as deref
 from spacy.spacy cimport Language
+from spacy.lexeme cimport StringAttr

-cdef enum Field:
-    lex


 cdef class Tokens:

@@ -17,5 +14,5 @@ cdef class Tokens:
     cpdef int append(self, Lexeme_addr token)
     cpdef int extend(self, Tokens other) except -1

-    cpdef list group_by(self, Field attr)
+    cpdef object group_by(self, StringAttr attr)
-    cpdef dict count_by(self, Field attr)
+    cpdef dict count_by(self, StringAttr attr)
@@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as inc


+from spacy.lexeme cimport Lexeme
+from spacy.lexeme cimport attr_of, norm_of, shape_of
+from spacy.spacy cimport StringHash
+

 cdef class Tokens:
     def __cinit__(self, Language lang):
         self.lang = lang

@@ -32,17 +37,53 @@ cdef class Tokens:
         for el in other:
             self.append(el)

-    cpdef list group_by(self, Field attr):
-        pass
+    cpdef object group_by(self, StringAttr attr):
+        '''Group tokens that share the property attr into Tokens instances, and
+        return a list of them. Returns a tuple of three lists:
+
+        (string names, hashes, tokens)
+
+        The lists are aligned, so the ith entry in string names is the string
+        that the ith entry in hashes unhashes to, which the Tokens instance
+        is grouped by.
+
+        You can then use count_by or group_by on the Tokens
+        for further processing. Calling group_by and then asking the length
+        of the Tokens objects is equivalent to count_by, but somewhat slower.
+        '''
+        # Implementation here is working around some of the constraints in
+        # Cython about what type of thing can go in what type of container.
+        # Long story short, it's pretty hard to get a Python object like
+        # Tokens into a vector or array. If we really need this to run faster,
+        # we can be tricky and get the Python list access out of the loop. What
+        # we'd do is store pointers to the underlying vectors.
+        # So far, speed isn't mattering here.
+        cdef dict indices = {}
+        cdef list groups = []
+        cdef list names = []
+        cdef list hashes = []
+
+        cdef StringHash key
+        cdef Lexeme_addr t
+        for t in self.vctr[0]:
+            key = attr_of(t, attr)
+            if key in indices:
+                groups[indices[key]].append(t)
+            else:
+                indices[key] = len(groups)
+                groups.append(Tokens(self.lang))
+                names.append(self.lang.unhash(key))
+                hashes.append(key)
+                groups[-1].append(t)
+        return names, hashes, groups

-    cpdef dict count_by(self, Field attr):
+    cpdef dict count_by(self, StringAttr attr):
         counts = {}
         cdef Lexeme_addr t
-        cdef Lexeme* word
+        cdef StringHash key
         for t in self.vctr[0]:
-            word = <Lexeme*>t
+            key = attr_of(t, attr)
-            if word.lex not in counts:
+            if key not in counts:
-                counts[word.lex] = 0
+                counts[key] = 0
-            counts[word.lex] += 1
+            counts[key] += 1
         return counts
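The new grouping API is easiest to read next to the tests added below. An assumed end-to-end usage, following those tests:

    from spacy import en, LEX, LAST3
    from spacy.lexeme import lex_of

    tokens = en.tokenize("I like the red one and I like the blue one")

    # count_by: {hash of the chosen view: number of tokens sharing it}
    counts = tokens.count_by(LEX)

    # group_by: three aligned lists; each group is itself a Tokens instance
    names, hashes, groups = tokens.group_by(LEX)
    assert names[0] == 'I' and len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'

    # Grouping by LAST3 buckets tokens by suffix instead
    names, hashes, groups = tokens.group_by(LAST3)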
tests/test_group_by.py (35 lines, new file)
@@ -0,0 +1,35 @@
from __future__ import unicode_literals

import pytest

from spacy import en
from spacy.lexeme import lex_of

from spacy import SIC, LEX, NORM, SHAPE, LAST3


def test_group_by_lex():
    tokens = en.tokenize("I like the red one and I like the blue one")
    names, hashes, groups = tokens.group_by(LEX)

    assert len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert names[0] == 'I'
    assert len(groups[1]) == 2
    assert en.unhash(lex_of(groups[1][0])) == 'like'
    assert names[1] == "like"
    assert len(groups[2]) == 2
    assert len(groups[3]) == 1


def test_group_by_last3():
    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
    names, hashes, groups = tokens.group_by(LAST3)

    assert len(groups[0]) == 1
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert len(groups[1]) == 3
    assert en.unhash(lex_of(groups[1][0])) == 'the'
    assert len(groups[2]) == 2
    assert len(groups[3]) == 2
    assert len(groups[4]) == 1
tests/test_orth.py (16 lines, new file)
@@ -0,0 +1,16 @@
from __future__ import unicode_literals

import pytest

from spacy.en import lookup, unhash

from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
from spacy.lexeme import shape_of


@pytest.fixture
def C3P0():
    return lookup("C3P0")


def test_shape(C3P0):
    assert unhash(shape_of(C3P0)) == "XdXd"
@@ -48,3 +48,10 @@ def test_three_same_open(open_puncts):
     assert len(tokens) == 4
     assert unhash(lex_of(tokens[0])) == p
     assert unhash(lex_of(tokens[3])) == word_str
+
+
+def test_open_appostrophe():
+    string = "'The"
+    tokens = expand_chunk(lookup(string))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "'"