Merge string views feature branch

Matthew Honnibal 2014-07-23 17:37:48 +01:00
commit 150cf6dd3b
18 changed files with 346 additions and 584 deletions

View File

@ -1,177 +0,0 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html        to make standalone HTML files"
	@echo "  dirhtml     to make HTML files named index.html in directories"
	@echo "  singlehtml  to make a single large HTML file"
	@echo "  pickle      to make pickle files"
	@echo "  json        to make JSON files"
	@echo "  htmlhelp    to make HTML files and a HTML help project"
	@echo "  qthelp      to make HTML files and a qthelp project"
	@echo "  devhelp     to make HTML files and a Devhelp project"
	@echo "  epub        to make an epub"
	@echo "  latex       to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf    to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja  to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text        to make text files"
	@echo "  man         to make manual pages"
	@echo "  texinfo     to make Texinfo files"
	@echo "  info        to make Texinfo files and run them through makeinfo"
	@echo "  gettext     to make PO message catalogs"
	@echo "  changes     to make an overview of all changed/added/deprecated items"
	@echo "  xml         to make Docutils-native XML files"
	@echo "  pseudoxml   to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck   to check all external links for integrity"
	@echo "  doctest     to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spacy.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spacy.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/spacy"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spacy"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

View File

@ -1,269 +0,0 @@
# -*- coding: utf-8 -*-
#
# spacy documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 3 21:54:08 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'spacy'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.coverage',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'spacy'
copyright = u'2014, Matthew Honnibal'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.0'
# The full version, including alpha/beta/rc tags.
release = '0.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'spacydoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'spacy.tex', u'spacy Documentation',
u'Matthew Honnibal', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'spacy', u'spacy Documentation',
[u'Matthew Honnibal'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'spacy', u'spacy Documentation',
u'Matthew Honnibal', 'spacy', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}

View File

@ -1,26 +0,0 @@
.. spacy documentation master file, created by
   sphinx-quickstart on Thu Jul 3 21:54:08 2014.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to spacy's documentation!
=================================

Contents:

.. toctree::
   :maxdepth: 2

.. py:function:: enumerate(sequence[, start=0])

   Return an iterator that yields tuples of an index and an item of the
   *sequence*. (And so on.)

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@ -1,8 +1,17 @@
from .lexeme import lex_of
from .lexeme import sic_of
from .tokens import Tokens
__all__ = [lex_of, sic_of]
# Don't know how to make the enum visible from Python :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
"""

View File

@ -1,8 +1,8 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens

View File

@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util
@ -37,12 +36,15 @@ cdef class English(spacy.Language):
cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        # ...Unless we're at 0
        return i == 0
    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
    # Don't count periods as punct if the next char is not whitespace
    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
        return False
    return not word[i].isalnum()
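
A note on the rule change above: a period is now split off only when the next character is whitespace, rather than being protected only before digits. A minimal plain-Python sketch of just the new branch (the helper name and sample words are illustrative, not part of the codebase):

def period_is_punct(word, i):
    # Mirrors the new branch: word-internal periods are never punctuation.
    length = len(word)
    if word[i] == "." and i < (length - 1) and not word[i + 1].isspace():
        return False
    return not word[i].isalnum()

assert period_is_punct("etc.", 3)        # trailing period: split it off
assert not period_is_punct("3.14", 1)    # decimal point survives, as before
assert not period_is_punct("e.g.", 1)    # abbreviation-internal period now survives too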

View File

@ -2,8 +2,8 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.tokens cimport Tokens

View File

@ -4,11 +4,11 @@ boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util

View File

@ -4,39 +4,53 @@ from libc.stdint cimport uint64_t
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64

from spacy.spacy cimport Language

cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    Py_UNICODE first
    Bits8 flags

cdef struct Distribution:
    double prob
    ClusterID cluster
    Bits64 tagdict
    Bits8 flags

cdef struct Lexeme:
    StringHash sic       # Hash of the original string
    StringHash lex       # Hash of the word, with punctuation and clitics split off
    StringHash normed    # Hash of the normalized version of lex
    StringHash last3     # Last 3 characters of the token
    Py_UNICODE first     # First character of the token
    double prob          # What is the log probability of the lex value?
    ClusterID cluster    # Brown cluster of the token
    bint oft_upper       # Is the lowered version of the lex value often in all caps?
    bint oft_title       # Is the lowered version of the lex value often title-cased?
    Distribution* dist   # Distribution info, lazy loaded
    Orthography* orth    # Extra orthographic views
    Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens

cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length)

# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n

cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

#cdef uint64_t get_attr(Lexeme* word, HashFields attr)

cdef enum StringAttr:
    SIC
    LEX
    NORM
    SHAPE
    LAST3

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

cpdef StringHash sic_of(size_t lex_id) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
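
Taken together, these declarations give every interned word five hashed string views, one per StringAttr. A hypothetical session using the lookup/unhash helpers that the new tests below exercise (the values follow the docstrings and the shape rules, but are illustrative):

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, last3_of

lex_id = lookup("Hello")     # a size_t address of the interned Lexeme
unhash(lex_of(lex_id))       # u'Hello'
unhash(last3_of(lex_id))     # u'llo'
unhash(shape_of(lex_id))     # u'Xxxx'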

View File

@ -11,49 +11,29 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length):
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

# Reiterate the enum, for python
#SIC = StringAttr.sic
#LEX = StringAttr.lex
#NORM = StringAttr.norm
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = substr(string, 0, split, length)
        tail_string = substr(string, split, length, length)

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
    if attr == SIC:
        return sic_of(lex_id)
    elif attr == LEX:
        return lex_of(lex_id)
    elif attr == NORM:
        return norm_of(lex_id)
    elif attr == SHAPE:
        return shape_of(lex_id)
    elif attr == LAST3:
        return last3_of(lex_id)
    else:
        lex = string
        tail_string = ''
    assert lex
    #cdef unicode normed = normalize_word_string(lex)
    cdef unicode normed = '?'
    cdef unicode last3 = substr(string, length - 3, length, length)
    assert normed
    assert len(normed)

    word.lex = lang.hash_string(lex, len(lex))
    word.normed = lang.hash_string(normed, len(normed))
    word.last3 = lang.hash_string(last3, len(last3))

    lang.bacov[word.lex] = lex
    lang.bacov[word.normed] = normed
    lang.bacov[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
    return word
        raise StandardError
cpdef StringHash sic_of(size_t lex_id) except 0:
@ -82,6 +62,35 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).lex

cpdef StringHash norm_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).orth.norm

cpdef StringHash shape_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).orth.shape

cpdef StringHash last3_of(size_t lex_id) except 0:
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).orth.last3
cpdef ClusterID cluster_of(size_t lex_id):
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
gives an integer representation of the cluster ID of the word,
@ -98,7 +107,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
'''
return (<Lexeme*>lex_id).cluster
return (<Lexeme*>lex_id).dist.cluster
cpdef Py_UNICODE first_of(size_t lex_id):
@ -109,7 +118,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first
    return (<Lexeme*>lex_id).orth.first
cpdef double prob_of(size_t lex_id):
@ -122,18 +131,7 @@ cpdef double prob_of(size_t lex_id):
    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass

cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3
    return (<Lexeme*>lex_id).dist.prob
cpdef bint is_oft_upper(size_t lex_id):
@ -148,7 +146,12 @@ cpdef bint is_oft_upper(size_t lex_id):
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
    #return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
@ -163,4 +166,5 @@ cpdef bint is_oft_title(size_t lex_id):
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
    return False
    #return (<Lexeme*>lex_id).oft_title

View File

@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
ctypedef dense_hash_map[StringHash, size_t] Vocab

from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens

# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID

from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography

cdef class Language:
    cdef object name
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
    cdef dict bacov

    cdef int find_split(self, unicode word, size_t length)
@ -26,3 +37,8 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length)
    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
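
Loosely speaking, the three Vocab* tables and bacov declared above behave like dicts keyed by StringHash; a sketch of intent, not of the sparsehash-backed implementation:

vocab = {}    # StringHash -> Lexeme address: the interned tokens
ortho = {}    # StringHash of lex -> Orthography*: shared string views
distri = {}   # StringHash of lex -> Distribution*: lazily loaded statistics
bacov = {}    # StringHash -> unicode ("vocab" reversed): what unhash() reads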

View File

@ -6,22 +6,65 @@ from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B

from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD

from spacy.string_tools cimport is_whitespace
from spacy.string_tools cimport substr

from . import util
from os import path

cimport cython

def get_normalized(unicode lex, size_t length):
    if lex.isalpha() and lex.islower():
        return lex
    else:
        return get_word_shape(lex, length)

def get_word_shape(lex, length):
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            if c.isupper():
                shape_char = "X"
            else:
                shape_char = "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape

def set_orth_flags(lex, length):
    return 0
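
A quick check of what get_word_shape produces, runnable against the pure-Python function above (the "XdXd" case matches the new tests/test_orth.py below; the others follow from the run-capping logic):

assert get_word_shape("C3P0", 4) == "XdXd"            # letter/digit alternation
assert get_word_shape("Hello", 5) == "Xxxx"           # runs of one shape char are capped at three
assert get_word_shape("google.com", 10) == "xxx.xxx"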
cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    def load_tokenization(self, token_rules=None):
@ -80,7 +123,7 @@ cdef class Language:
        return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        word = init_lexeme(self, string, hashed, split, length)
        word = self.init_lexeme(string, hashed, split, length)
        self.vocab[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word
@ -121,6 +164,55 @@ cdef class Language:
    cdef int find_split(self, unicode word, size_t length):
        return -1

    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length):
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

        word.sic = hashed

        cdef unicode tail_string
        cdef unicode lex
        if split != 0 and split < length:
            lex = substr(string, 0, split, length)
            tail_string = substr(string, split, length, length)
        else:
            lex = string
            tail_string = ''

        word.lex = self.hash_string(lex, len(lex))
        self.bacov[word.lex] = lex
        word.orth = <Orthography*>self.ortho[0][word.lex]
        if word.orth == NULL:
            word.orth = self.init_orth(word.lex, lex)
        word.dist = <Distribution*>self.distri[0][word.lex]

        # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
        return word

    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
        orth.first = <Py_UNICODE>lex[0]

        cdef int length = len(lex)
        orth.flags = set_orth_flags(lex, length)

        cdef unicode last3 = substr(lex, length - 3, length, length)
        cdef unicode norm = get_normalized(lex, length)
        cdef unicode shape = get_word_shape(lex, length)

        orth.last3 = self.hash_string(last3, len(last3))
        orth.shape = self.hash_string(shape, len(shape))
        orth.norm = self.hash_string(norm, len(norm))

        self.bacov[orth.last3] = last3
        self.bacov[orth.shape] = shape
        self.bacov[orth.norm] = norm

        self.ortho[0][hashed] = <size_t>orth
        return orth
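
The lookup path above amounts to a two-level cache: Lexemes are interned by the hash of the original string, while their orthographic views are interned separately by the lex hash, so "Hello" inside "Hello!" shares one Orthography record. Schematically, in plain Python (a sketch under those assumptions, reusing the helpers defined earlier in this file):

def views_for(lang, lex):
    # init_lexeme consults lang.ortho first; init_orth fills it on a miss.
    key = lang.hash_string(lex, len(lex))
    if key not in lang.ortho:
        lang.ortho[key] = {
            "first": lex[0],
            "last3": lex[-3:],
            "norm": get_normalized(lex, len(lex)),
            "shape": get_word_shape(lex, len(lex)),
        }
    return lang.ortho[key]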
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@ -137,7 +229,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
    while word != NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens

View File

@ -1,5 +1,6 @@
# cython: profile=True
cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1

View File

@ -1,12 +1,9 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language

cdef enum Field:
    lex

from spacy.lexeme cimport StringAttr
cdef class Tokens:
@ -17,5 +14,5 @@ cdef class Tokens:
    cpdef int append(self, Lexeme_addr token)
    cpdef int extend(self, Tokens other) except -1

    cpdef list group_by(self, Field attr)
    cpdef dict count_by(self, Field attr)
    cpdef object group_by(self, StringAttr attr)
    cpdef dict count_by(self, StringAttr attr)

View File

@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc

from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport attr_of, norm_of, shape_of
from spacy.spacy cimport StringHash

cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
@ -32,17 +37,53 @@ cdef class Tokens:
        for el in other:
            self.append(el)

    cpdef list group_by(self, Field attr):
        pass
    cpdef object group_by(self, StringAttr attr):
        '''Group tokens that share the property attr into Tokens instances, and
        return a list of them. Returns a tuple of three lists:

        (string names, hashes, tokens)

    cpdef dict count_by(self, Field attr):
        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the Tokens instance
        is grouped by.

        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef Lexeme_addr t
        for t in self.vctr[0]:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, StringAttr attr):
        counts = {}
        cdef Lexeme_addr t
        cdef Lexeme* word
        cdef StringHash key
        for t in self.vctr[0]:
            word = <Lexeme*>t
            if word.lex not in counts:
                counts[word.lex] = 0
            counts[word.lex] += 1
            key = attr_of(t, attr)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
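
For a feel of the two aggregation APIs, a hypothetical session assembled from the new tests below (en.tokenize and the module-level LEX constant are per the test imports):

from spacy import en, LEX

tokens = en.tokenize("I like the red one and I like the blue one")
names, hashes, groups = tokens.group_by(LEX)   # three aligned lists
counts = tokens.count_by(LEX)                  # {StringHash: count}
# Per the docstring, len(groups[i]) == counts[hashes[i]] for every i.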

tests/test_group_by.py (new file, 35 lines)
View File

@ -0,0 +1,35 @@
from __future__ import unicode_literals

import pytest

from spacy import en
from spacy.lexeme import lex_of
from spacy import SIC, LEX, NORM, SHAPE, LAST3

def test_group_by_lex():
    tokens = en.tokenize("I like the red one and I like the blue one")
    names, hashes, groups = tokens.group_by(LEX)

    assert len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert names[0] == 'I'
    assert len(groups[1]) == 2
    assert en.unhash(lex_of(groups[1][0])) == 'like'
    assert names[1] == "like"
    assert len(groups[2]) == 2
    assert len(groups[3]) == 1

def test_group_by_last3():
    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
    names, hashes, groups = tokens.group_by(LAST3)

    assert len(groups[0]) == 1
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert len(groups[1]) == 3
    assert en.unhash(lex_of(groups[1][0])) == 'the'
    assert len(groups[2]) == 2
    assert len(groups[3]) == 2
    assert len(groups[4]) == 1

tests/test_orth.py (new file, 16 lines)
View File

@ -0,0 +1,16 @@
from __future__ import unicode_literals

import pytest

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
from spacy.lexeme import shape_of

@pytest.fixture
def C3P0():
    return lookup("C3P0")

def test_shape(C3P0):
    assert unhash(shape_of(C3P0)) == "XdXd"

View File

@ -48,3 +48,10 @@ def test_three_same_open(open_puncts):
    assert len(tokens) == 4
    assert unhash(lex_of(tokens[0])) == p
    assert unhash(lex_of(tokens[3])) == word_str

def test_open_appostrophe():
    string = "'The"
    tokens = expand_chunk(lookup(string))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "'"