Better TOC generation

2025-07-31 02:09:54 +03:00 · 2017-05-02 17:46:47 +03:00 · 2017-05-02 17:46:47 +03:00 · f1ab9b6179
commit f1ab9b6179
parent d1a0fe7ee8
5 changed files with 55 additions and 204 deletions
--- a/docs/toc.md
+++ b/docs/toc.md
@ -1,5 +1,4 @@
-Table of Contents
-=================
+# Table of Contents

   * [Overview](index.md#overview)
      * [Installation](index.md#installation)
@ -29,8 +28,10 @@ Table of Contents
      * [Working with materialized and alias fields](field_types.md#working-with-materialized-and-alias-fields)

   * [Table Engines](table_engines.md#table-engines)
-      * [Buffer Models](table_engines.md#buffer-models)
-      * [Data Replication](table_engines.md#data-replication)
+      * [Simple Engines](table_engines.md#simple-engines)
+      * [Engines in the MergeTree Family](table_engines.md#engines-in-the-mergetree-family)
+         * [Data Replication](table_engines.md#data-replication)
+      * [Buffer Engine](table_engines.md#buffer-engine)

   * [Schema Migrations](schema_migrations.md#schema-migrations)
      * [Writing Migrations](schema_migrations.md#writing-migrations)
@ -40,3 +41,4 @@ Table of Contents
      * [Partitions and Parts](system_models.md#partitions-and-parts)

   * [Contributing](contributing.md#contributing)
+
--- a/scripts/README.md
+++ b/scripts/README.md
@ -2,14 +2,14 @@ This directory contains various scripts for use while developing.

 generate_toc
 ------------
-Generates the table of contents (toc.md)
+Generates the table of contents (toc.md). Requires Pandoc.
 Usage:
    cd docs
    ../scripts/generate_toc.sh


-gh-md-toc
---------
+html_to_markdown_toc.py
+-----------------------
 Used by generate_toc.


--- a/scripts/generate_toc.sh
+++ b/scripts/generate_toc.sh
@ -1,13 +1,16 @@
-echo "Table of Contents" > toc.md
-echo "=================" >> toc.md

-../scripts/gh-md-toc \
-    index.md \
-    models_and_databases.md \
-    querysets.md \
-    field_types.md \
-    table_engines.md \
-    schema_migrations.md \
-    system_models.md \
-    contributing.md \
-    >> toc.md
+generate_one() {
+    # Renders the Markdown file "$1" to HTML with Pandoc, pipes that HTML to html_to_markdown_toc.py (which also gets the file name as its argument), and appends the output to toc.md
+    pandoc "$1" | python "../scripts/html_to_markdown_toc.py" "$1" >> toc.md
+}
+
+printf "# Table of Contents\n\n" > toc.md  # truncate toc.md and write the title; every generate_one call below appends to it
+
+generate_one "index.md"  # the call order below is the order in which entries are appended to toc.md
+generate_one "models_and_databases.md"
+generate_one "querysets.md"
+generate_one "field_types.md"
+generate_one "table_engines.md"
+generate_one "schema_migrations.md"
+generate_one "system_models.md"
+generate_one "contributing.md"
--- a/scripts/gh-md-toc
+++ b/scripts/gh-md-toc
@ -1,185 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Source: https://github.com/ekalinin/github-markdown-toc
-#
-# Steps:
-#
-#  1. Download corresponding html file for some README.md:
-#       curl -s $1
-#
-#  2. Discard rows where no substring 'user-content-' (github's markup):
-#       awk '/user-content-/ { ...
-#
-#  3.1 Get last number in each row like ' ... </span></a>sitemap.js</h1'.
-#      It's a level of the current header:
-#       substr($0, length($0), 1)
-#
-#  3.2 Get level from 3.1 and insert corresponding number of spaces before '*':
-#       sprintf("%*s", substr($0, length($0), 1)*2, " ")
-#
-#  4. Find head's text and insert it inside "* [ ... ]":
-#       substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)
-#
-#  5. Find anchor and insert it inside "(...)":
-#       substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8)
-#
-
-gh_toc_version="0.4.8"
-
-gh_user_agent="gh-md-toc v$gh_toc_version"
-
-#
-# Download rendered into html README.md by its url.
-#
-#
-gh_toc_load() {
-    local gh_url=$1
-
-    if type curl &>/dev/null; then
-        curl --user-agent "$gh_user_agent" -s "$gh_url"
-    elif type wget &>/dev/null; then
-        wget --user-agent="$gh_user_agent" -qO- "$gh_url"
-    else
-        echo "Please, install 'curl' or 'wget' and try again."
-        exit 1
-    fi
-}
-
-#
-# Converts local md file into html by GitHub
-#
-# ➥ curl -X POST --data '{"text": "Hello world github/linguist#1 **cool**, and #1!"}' https://api.github.com/markdown
-# <p>Hello world github/linguist#1 <strong>cool</strong>, and #1!</p>'"
-gh_toc_md2html() {
-    local gh_file_md=$1
-    curl -s --user-agent "$gh_user_agent" \
-        --data-binary @"$gh_file_md" -H "Content-Type:text/plain" \
-        https://api.github.com/markdown/raw
-}
-
-#
-# Is passed string url
-#
-gh_is_url() {
-    if [[ $1 == https* || $1 == http* ]]; then
-        echo "yes"
-    else
-        echo "no"
-    fi
-}
-
-#
-# TOC generator
-#
-gh_toc(){
-    local gh_src=$1
-    local gh_src_copy=$1
-    local gh_ttl_docs=$2
-
-    if [ "$gh_src" = "" ]; then
-        echo "Please, enter URL or local path for a README.md"
-        exit 1
-    fi
-
-
-    # Show "TOC" string only if working with one document
-    if [ "$gh_ttl_docs" = "1" ]; then
-
-        echo "Table of Contents"
-        echo "================="
-        echo ""
-        gh_src_copy=""
-
-    fi
-
-    if [ "$(gh_is_url "$gh_src")" == "yes" ]; then
-        gh_toc_load "$gh_src" | gh_toc_grab "$gh_src_copy"
-    else
-        gh_toc_md2html "$gh_src" | gh_toc_grab "$gh_src_copy"
-    fi
-}
-
-#
-# Grabber of the TOC from rendered html
-#
-# $1 — a source url of document.
-# It's need if TOC is generated for multiple documents.
-#
-gh_toc_grab() {
-    # if closed <h[1-6]> is on the new line, then move it on the prev line
-    # for example:
-    #   was: The command <code>foo1</code>
-    #        </h1>
-    #   became: The command <code>foo1</code></h1>
-    sed -e ':a' -e 'N' -e '$!ba' -e 's/\n<\/h/<\/h/g' |
-    # find strings that corresponds to template
-    grep -E -o '<a\s*id="user-content-[^"]*".*</h[1-6]' |
-    # remove code tags
-    sed 's/<code>//' | sed 's/<\/code>//' |
-    # now all rows are like:
-    #   <a id="user-content-..." href="..."><span ...></span></a> ... </h1
-    # format result line
-    #   * $0 — whole string
-    echo -e "$(awk -v "gh_url=$1" '{
-    print sprintf("%*s", substr($0, length($0), 1)*3, " ") "* [" substr($0, match($0, /a>.*<\/h/)+2, RLENGTH-5)"](" gh_url substr($0, match($0, "href=\"[^\"]+?\" ")+6, RLENGTH-8) ")"}' | sed 'y/+/ /; s/%/\\x/g')"
-}
-
-#
-# Returns filename only from full path or url
-#
-gh_toc_get_filename() {
-    echo "${1##*/}"
-}
-
-#
-# Options hendlers
-#
-gh_toc_app() {
-    local app_name="gh-md-toc"
-
-    if [ "$1" = '--help' ] || [ $# -eq 0 ] ; then
-        echo "GitHub TOC generator ($app_name): $gh_toc_version"
-        echo ""
-        echo "Usage:"
-        echo "  $app_name src [src]     Create TOC for a README file (url or local path)"
-        echo "  $app_name -             Create TOC for markdown from STDIN"
-        echo "  $app_name --help        Show help"
-        echo "  $app_name --version     Show version"
-        return
-    fi
-
-    if [ "$1" = '--version' ]; then
-        echo "$gh_toc_version"
-        return
-    fi
-
-    if [ "$1" = "-" ]; then
-        if [ -z "$TMPDIR" ]; then
-            TMPDIR="/tmp"
-        elif [ -n "$TMPDIR" -a ! -d "$TMPDIR" ]; then
-            mkdir -p "$TMPDIR"
-        fi
-        local gh_tmp_md
-        gh_tmp_md=$(mktemp $TMPDIR/tmp.XXXXXX)
-        while read input; do
-            echo "$input" >> "$gh_tmp_md"
-        done
-        gh_toc_md2html "$gh_tmp_md" | gh_toc_grab ""
-        return
-    fi
-
-    for md in "$@"
-    do
-        echo ""
-        gh_toc "$md" "$#"
-    done
-
-    #echo ""
-    #echo "Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)"
-}
-
-#
-# Entry point
-#
-gh_toc_app "$@"
--- a/scripts/html_to_markdown_toc.py
+++ b/scripts/html_to_markdown_toc.py
@ -0,0 +1,31 @@
+from HTMLParser import HTMLParser
+import sys
+
+
+HEADER_TAGS = ('h1', 'h2', 'h3')
+
+
+class HeadersToMarkdownParser(HTMLParser):  # prints one Markdown list item per h1/h2/h3 element found in the HTML fed to it
+
+    inside = None  # tag name of the header currently open, or None when not inside a header
+    text = ''  # text collected so far for the header currently open
+
+    def handle_starttag(self, tag, attrs):
+        if tag.lower() in HEADER_TAGS:
+            self.inside = tag  # start collecting text for this header
+
+    def handle_endtag(self, tag):
+        if tag.lower() in HEADER_TAGS:
+            indent = '   ' * int(self.inside[1])  # second character of 'hN' is the level; three spaces per level
+            fragment = self.text.lower().replace(' ', '-')  # link fragment: header text lowercased, spaces replaced by dashes
+            print '%s* [%s](%s#%s)' % (indent, self.text, sys.argv[1], fragment)  # sys.argv[1] is used as the link target before '#'
+            self.inside = None  # header closed: reset state for the next one
+            self.text = ''
+
+    def handle_data(self, data):
+        if self.inside:  # text outside of header tags is ignored
+            self.text += data
+
+
+HeadersToMarkdownParser().feed(sys.stdin.read())  # feed() is an instance method: instantiate the parser, then parse all of stdin
+print  # Python 2 print statement: emits an empty line after this file's entries