Merge string views feature branch

Matthew Honnibal 2014-07-23 17:37:48 +01:00
commit 150cf6dd3b
18 changed files with 346 additions and 584 deletions

View File

@ -1,177 +0,0 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build

# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif

# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html        to make standalone HTML files"
	@echo "  dirhtml     to make HTML files named index.html in directories"
	@echo "  singlehtml  to make a single large HTML file"
	@echo "  pickle      to make pickle files"
	@echo "  json        to make JSON files"
	@echo "  htmlhelp    to make HTML files and a HTML help project"
	@echo "  qthelp      to make HTML files and a qthelp project"
	@echo "  devhelp     to make HTML files and a Devhelp project"
	@echo "  epub        to make an epub"
	@echo "  latex       to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf    to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja  to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text        to make text files"
	@echo "  man         to make manual pages"
	@echo "  texinfo     to make Texinfo files"
	@echo "  info        to make Texinfo files and run them through makeinfo"
	@echo "  gettext     to make PO message catalogs"
	@echo "  changes     to make an overview of all changed/added/deprecated items"
	@echo "  xml         to make Docutils-native XML files"
	@echo "  pseudoxml   to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck   to check all external links for integrity"
	@echo "  doctest     to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spacy.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spacy.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/spacy"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spacy"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	make -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."

xml:
	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."

pseudoxml:
	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."

View File

@ -1,269 +0,0 @@
# -*- coding: utf-8 -*-
#
# spacy documentation build configuration file, created by
# sphinx-quickstart on Thu Jul 3 21:54:08 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.join(os.path.abspath('.'), 'spacy'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.coverage',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'spacy'
copyright = u'2014, Matthew Honnibal'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.0'
# The full version, including alpha/beta/rc tags.
release = '0.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'spacydoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'spacy.tex', u'spacy Documentation',
u'Matthew Honnibal', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'spacy', u'spacy Documentation',
[u'Matthew Honnibal'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'spacy', u'spacy Documentation',
u'Matthew Honnibal', 'spacy', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'http://docs.python.org/': None}

View File

@ -1,26 +0,0 @@
.. spacy documentation master file, created by
   sphinx-quickstart on Thu Jul 3 21:54:08 2014.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to spacy's documentation!
=================================

Contents:

.. toctree::
   :maxdepth: 2

.. py:function:: enumerate(sequence[, start=0])

   Return an iterator that yields tuples of an index and an item of the
   *sequence*. (And so on.)

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@ -1,8 +1,17 @@
from .lexeme import lex_of
from .lexeme import sic_of
from .tokens import Tokens
__all__ = [lex_of, sic_of]
# Don't know how to make the enum visible from Python :(
SIC = 0
LEX = 1
NORM = 2
SHAPE = 3
LAST3 = 4
__all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3]
"""

View File

@ -1,8 +1,8 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.spacy cimport Language
from spacy.tokens cimport Tokens

View File

@ -9,7 +9,6 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util
@ -37,12 +36,15 @@ cdef class English(spacy.Language):
cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        # ...Unless we're at 0
        return i == 0
    if word[i] == "-" and i < (length - 1) and word[i+1] == '-':
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
    # Don't count periods as punct if the next char is not whitespace
    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
        return False
    return not word[i].isalnum()
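
A note on the rule change above: a period is now split off only when the next character is whitespace, rather than being protected only before digits. A minimal plain-Python sketch of just the new branch (the helper name and sample words are illustrative, not part of the codebase):

def period_is_punct(word, i):
    # Mirrors the new branch: word-internal periods are never punctuation.
    length = len(word)
    if word[i] == "." and i < (length - 1) and not word[i + 1].isspace():
        return False
    return not word[i].isalnum()

assert period_is_punct("etc.", 3)        # trailing period: split it off
assert not period_is_punct("3.14", 1)    # decimal point survives, as before
assert not period_is_punct("e.g.", 1)    # abbreviation-internal period now survives too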

View File

@ -2,8 +2,8 @@ from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.tokens cimport Tokens

View File

@ -4,11 +4,11 @@ boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util

View File

@ -4,39 +4,53 @@ from libc.stdint cimport uint64_t
ctypedef int ClusterID
ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
ctypedef char Bits8
ctypedef uint64_t Bits64

from spacy.spacy cimport Language

cdef struct Orthography:
    StringHash last3
    StringHash shape
    StringHash norm
    Py_UNICODE first
    Bits8 flags

cdef struct Distribution:
    double prob
    ClusterID cluster
    Bits64 tagdict
    Bits8 flags

cdef struct Lexeme:
    StringHash sic       # Hash of the original string
    StringHash lex       # Hash of the word, with punctuation and clitics split off
    StringHash normed    # Hash of the normalized version of lex
    StringHash last3     # Last 3 characters of the token
    Py_UNICODE first     # First character of the token
    double prob          # What is the log probability of the lex value?
    ClusterID cluster    # Brown cluster of the token
    bint oft_upper       # Is the lowered version of the lex value often in all caps?
    bint oft_title       # Is the lowered version of the lex value often title-cased?
    Distribution* dist   # Distribution info, lazy loaded
    Orthography* orth    # Extra orthographic views
    Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens

cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length)

# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n

cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)

#cdef uint64_t get_attr(Lexeme* word, HashFields attr)

cdef enum StringAttr:
    SIC
    LEX
    NORM
    SHAPE
    LAST3

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

cpdef StringHash sic_of(size_t lex_id) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
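
Taken together, these declarations give every interned word five hashed string views, one per StringAttr. A hypothetical session using the lookup/unhash helpers that the new tests below exercise (the values follow the docstrings and the shape rules, but are illustrative):

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, last3_of

lex_id = lookup("Hello")     # a size_t address of the interned Lexeme
unhash(lex_of(lex_id))       # u'Hello'
unhash(last3_of(lex_id))     # u'llo'
unhash(shape_of(lex_id))     # u'Xxxx'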

View File

@ -11,49 +11,29 @@ from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash

cdef Lexeme* init_lexeme(Language lang, unicode string, StringHash hashed,
                         int split, size_t length):
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

# Reiterate the enum, for python
#SIC = StringAttr.sic
#LEX = StringAttr.lex
#NORM = StringAttr.norm
#SHAPE = StringAttr.shape
#LAST3 = StringAttr.last3

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = substr(string, 0, split, length)
        tail_string = substr(string, split, length, length)

cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
    if attr == SIC:
        return sic_of(lex_id)
    elif attr == LEX:
        return lex_of(lex_id)
    elif attr == NORM:
        return norm_of(lex_id)
    elif attr == SHAPE:
        return shape_of(lex_id)
    elif attr == LAST3:
        return last3_of(lex_id)
    else:
        lex = string
        tail_string = ''
    assert lex
    #cdef unicode normed = normalize_word_string(lex)
    cdef unicode normed = '?'
    cdef unicode last3 = substr(string, length - 3, length, length)
    assert normed
    assert len(normed)

    word.lex = lang.hash_string(lex, len(lex))
    word.normed = lang.hash_string(normed, len(normed))
    word.last3 = lang.hash_string(last3, len(last3))

    lang.bacov[word.lex] = lex
    lang.bacov[word.normed] = normed
    lang.bacov[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lang.lookup(-1, tail_string, len(tail_string))
    return word
        raise StandardError
cpdef StringHash sic_of(size_t lex_id) except 0:
@ -82,6 +62,35 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).lex

cpdef StringHash norm_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).orth.norm

cpdef StringHash shape_of(size_t lex_id) except 0:
    return (<Lexeme*>lex_id).orth.shape

cpdef StringHash last3_of(size_t lex_id) except 0:
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).orth.last3
cpdef ClusterID cluster_of(size_t lex_id):
'''Access the `cluster' field of the Lexeme pointed to by lex_id, which
gives an integer representation of the cluster ID of the word,
@ -98,7 +107,7 @@ cpdef ClusterID cluster_of(size_t lex_id):
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
'''
return (<Lexeme*>lex_id).cluster
return (<Lexeme*>lex_id).dist.cluster
cpdef Py_UNICODE first_of(size_t lex_id):
@ -109,7 +118,7 @@ cpdef Py_UNICODE first_of(size_t lex_id):
    >>> unhash(first_of(lex_id))
    u'H'
    '''
    return (<Lexeme*>lex_id).first
    return (<Lexeme*>lex_id).orth.first
cpdef double prob_of(size_t lex_id):
@ -122,18 +131,7 @@ cpdef double prob_of(size_t lex_id):
    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass

cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3
    return (<Lexeme*>lex_id).dist.prob
cpdef bint is_oft_upper(size_t lex_id):
@ -148,7 +146,12 @@ cpdef bint is_oft_upper(size_t lex_id):
    >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper
    return False
    #cdef Lexeme* w = <Lexeme*>lex_id
    #return w.orth.last3 if w.orth != NULL else 0
    #return (<Lexeme*>lex_id).oft_upper
cpdef bint is_oft_title(size_t lex_id):
@ -163,4 +166,5 @@ cpdef bint is_oft_title(size_t lex_id):
    >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
    return False
    #return (<Lexeme*>lex_id).oft_title

View File

@ -7,16 +7,27 @@ from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
ctypedef dense_hash_map[StringHash, size_t] Vocab

from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens

# Put these above import to avoid circular import problem
ctypedef char Bits8
ctypedef uint64_t Bits64
ctypedef int ClusterID

from spacy.lexeme cimport Lexeme
from spacy.tokens cimport Tokens
from spacy.lexeme cimport Distribution
from spacy.lexeme cimport Orthography

cdef class Language:
    cdef object name
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
    cdef dict bacov

    cdef int find_split(self, unicode word, size_t length)
@ -26,3 +37,8 @@ cdef class Language:
    cpdef Tokens tokenize(self, unicode text)

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length)
    cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
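
Loosely speaking, the three Vocab* tables and bacov declared above behave like dicts keyed by StringHash; a sketch of intent, not of the sparsehash-backed implementation:

vocab = {}    # StringHash -> Lexeme address: the interned tokens
ortho = {}    # StringHash of lex -> Orthography*: shared string views
distri = {}   # StringHash of lex -> Distribution*: lazily loaded statistics
bacov = {}    # StringHash -> unicode ("vocab" reversed): what unhash() reads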

View File

@ -6,22 +6,65 @@ from libc.stdlib cimport calloc, free
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B

from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport BLANK_WORD

from spacy.string_tools cimport is_whitespace
from spacy.string_tools cimport substr

from . import util
from os import path

cimport cython

def get_normalized(unicode lex, size_t length):
    if lex.isalpha() and lex.islower():
        return lex
    else:
        return get_word_shape(lex, length)

def get_word_shape(lex, length):
    shape = ""
    last = ""
    shape_char = ""
    seq = 0
    for c in lex:
        if c.isalpha():
            if c.isupper():
                shape_char = "X"
            else:
                shape_char = "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            seq += 1
        else:
            seq = 0
        last = shape_char
        if seq < 3:
            shape += shape_char
    assert shape
    return shape

def set_orth_flags(lex, length):
    return 0
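
A quick check of what get_word_shape produces, runnable against the pure-Python function above (the "XdXd" case matches the new tests/test_orth.py below; the others follow from the run-capping logic):

assert get_word_shape("C3P0", 4) == "XdXd"            # letter/digit alternation
assert get_word_shape("Hello", 5) == "Xxxx"           # runs of one shape char are capped at three
assert get_word_shape("google.com", 10) == "xxx.xxx"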
cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
        self.load_tokenization(util.read_tokenization(name))

    def load_tokenization(self, token_rules=None):
@ -80,7 +123,7 @@ cdef class Language:
        return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        word = init_lexeme(self, string, hashed, split, length)
        word = self.init_lexeme(string, hashed, split, length)
        self.vocab[0][hashed] = <Lexeme_addr>word
        self.bacov[hashed] = string
        return word
@ -121,6 +164,55 @@ cdef class Language:
    cdef int find_split(self, unicode word, size_t length):
        return -1

    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
                             int split, size_t length):
        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

        word.sic = hashed

        cdef unicode tail_string
        cdef unicode lex
        if split != 0 and split < length:
            lex = substr(string, 0, split, length)
            tail_string = substr(string, split, length, length)
        else:
            lex = string
            tail_string = ''

        word.lex = self.hash_string(lex, len(lex))
        self.bacov[word.lex] = lex
        word.orth = <Orthography*>self.ortho[0][word.lex]
        if word.orth == NULL:
            word.orth = self.init_orth(word.lex, lex)
        word.dist = <Distribution*>self.distri[0][word.lex]

        # Now recurse, and deal with the tail
        if tail_string:
            word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string))
        return word

    cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
        cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
        orth.first = <Py_UNICODE>lex[0]

        cdef int length = len(lex)
        orth.flags = set_orth_flags(lex, length)

        cdef unicode last3 = substr(lex, length - 3, length, length)
        cdef unicode norm = get_normalized(lex, length)
        cdef unicode shape = get_word_shape(lex, length)

        orth.last3 = self.hash_string(last3, len(last3))
        orth.shape = self.hash_string(shape, len(shape))
        orth.norm = self.hash_string(norm, len(norm))

        self.bacov[orth.last3] = last3
        self.bacov[orth.shape] = shape
        self.bacov[orth.norm] = norm

        self.ortho[0][hashed] = <size_t>orth
        return orth
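
The lookup path above amounts to a two-level cache: Lexemes are interned by the hash of the original string, while their orthographic views are interned separately by the lex hash, so "Hello" inside "Hello!" shares one Orthography record. Schematically, in plain Python (a sketch under those assumptions, reusing the helpers defined earlier in this file):

def views_for(lang, lex):
    # init_lexeme consults lang.ortho first; init_orth fills it on a miss.
    key = lang.hash_string(lex, len(lex))
    if key not in lang.ortho:
        lang.ortho[key] = {
            "first": lex[0],
            "last3": lex[-3:],
            "norm": get_normalized(lex, len(lex)),
            "shape": get_word_shape(lex, len(lex)),
        }
    return lang.ortho[key]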
cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
@ -137,7 +229,7 @@ cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
    while word != NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens

View File

@ -1,5 +1,6 @@
# cython: profile=True
cpdef unicode substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1

View File

@ -1,12 +1,9 @@
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
from spacy.spacy cimport Lexeme_addr
from cython.operator cimport dereference as deref
from spacy.spacy cimport Language

cdef enum Field:
    lex

from spacy.lexeme cimport StringAttr
cdef class Tokens:
@ -17,5 +14,5 @@ cdef class Tokens:
    cpdef int append(self, Lexeme_addr token)
    cpdef int extend(self, Tokens other) except -1

    cpdef list group_by(self, Field attr)
    cpdef dict count_by(self, Field attr)
    cpdef object group_by(self, StringAttr attr)
    cpdef dict count_by(self, StringAttr attr)

View File

@ -2,6 +2,11 @@ from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as inc

from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport attr_of, norm_of, shape_of
from spacy.spacy cimport StringHash

cdef class Tokens:
    def __cinit__(self, Language lang):
        self.lang = lang
@ -32,17 +37,53 @@ cdef class Tokens:
        for el in other:
            self.append(el)

    cpdef list group_by(self, Field attr):
        pass
    cpdef object group_by(self, StringAttr attr):
        '''Group tokens that share the property attr into Tokens instances, and
        return a list of them. Returns a tuple of three lists:

        (string names, hashes, tokens)

    cpdef dict count_by(self, Field attr):
        The lists are aligned, so the ith entry in string names is the string
        that the ith entry in hashes unhashes to, which the Tokens instance
        is grouped by.

        You can then use count_by or group_by on the Tokens
        for further processing. Calling group_by and then asking the length
        of the Tokens objects is equivalent to count_by, but somewhat slower.
        '''
        # Implementation here is working around some of the constraints in
        # Cython about what type of thing can go in what type of container.
        # Long story short, it's pretty hard to get a Python object like
        # Tokens into a vector or array. If we really need this to run faster,
        # we can be tricky and get the Python list access out of the loop. What
        # we'd do is store pointers to the underlying vectors.
        # So far, speed isn't mattering here.
        cdef dict indices = {}
        cdef list groups = []
        cdef list names = []
        cdef list hashes = []

        cdef StringHash key
        cdef Lexeme_addr t
        for t in self.vctr[0]:
            key = attr_of(t, attr)
            if key in indices:
                groups[indices[key]].append(t)
            else:
                indices[key] = len(groups)
                groups.append(Tokens(self.lang))
                names.append(self.lang.unhash(key))
                hashes.append(key)
                groups[-1].append(t)
        return names, hashes, groups

    cpdef dict count_by(self, StringAttr attr):
        counts = {}
        cdef Lexeme_addr t
        cdef Lexeme* word
        cdef StringHash key
        for t in self.vctr[0]:
            word = <Lexeme*>t
            if word.lex not in counts:
                counts[word.lex] = 0
            counts[word.lex] += 1
            key = attr_of(t, attr)
            if key not in counts:
                counts[key] = 0
            counts[key] += 1
        return counts
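
For a feel of the two aggregation APIs, a hypothetical session assembled from the new tests below (en.tokenize and the module-level LEX constant are per the test imports):

from spacy import en, LEX

tokens = en.tokenize("I like the red one and I like the blue one")
names, hashes, groups = tokens.group_by(LEX)   # three aligned lists
counts = tokens.count_by(LEX)                  # {StringHash: count}
# Per the docstring, len(groups[i]) == counts[hashes[i]] for every i.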

tests/test_group_by.py (new file, 35 lines)
View File

@ -0,0 +1,35 @@
from __future__ import unicode_literals

import pytest

from spacy import en
from spacy.lexeme import lex_of
from spacy import SIC, LEX, NORM, SHAPE, LAST3

def test_group_by_lex():
    tokens = en.tokenize("I like the red one and I like the blue one")
    names, hashes, groups = tokens.group_by(LEX)

    assert len(groups[0]) == 2
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert names[0] == 'I'
    assert len(groups[1]) == 2
    assert en.unhash(lex_of(groups[1][0])) == 'like'
    assert names[1] == "like"
    assert len(groups[2]) == 2
    assert len(groups[3]) == 1

def test_group_by_last3():
    tokens = en.tokenize("I the blithe swarthy mate ate on the filthy deck")
    names, hashes, groups = tokens.group_by(LAST3)

    assert len(groups[0]) == 1
    assert en.unhash(lex_of(groups[0][0])) == 'I'
    assert len(groups[1]) == 3
    assert en.unhash(lex_of(groups[1][0])) == 'the'
    assert len(groups[2]) == 2
    assert len(groups[3]) == 2
    assert len(groups[4]) == 1

tests/test_orth.py (new file, 16 lines)
View File

@ -0,0 +1,16 @@
from __future__ import unicode_literals

import pytest

from spacy.en import lookup, unhash
from spacy.lexeme import sic_of, lex_of, norm_of, shape_of, first_of
from spacy.lexeme import shape_of

@pytest.fixture
def C3P0():
    return lookup("C3P0")

def test_shape(C3P0):
    assert unhash(shape_of(C3P0)) == "XdXd"

View File

@ -48,3 +48,10 @@ def test_three_same_open(open_puncts):
    assert len(tokens) == 4
    assert unhash(lex_of(tokens[0])) == p
    assert unhash(lex_of(tokens[3])) == word_str

def test_open_appostrophe():
    string = "'The"
    tokens = expand_chunk(lookup(string))
    assert len(tokens) == 2
    assert unhash(lex_of(tokens[0])) == "'"