diff mbox series

[1/3] Docs: An initial automarkup extension for sphinx

Message ID 20190621235159.6992-2-corbet@lwn.net
State New
Headers show
Series docs: function automarkup, now with 80% fewer regexes! | expand

Commit Message

Jonathan Corbet June 21, 2019, 11:51 p.m. UTC
Rather than fill our text files with :c:func:`function()` syntax, just do
the markup via a hook into the sphinx build process.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>

---
 Documentation/conf.py              |  3 +-
 Documentation/sphinx/automarkup.py | 80 ++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/sphinx/automarkup.py

-- 
2.21.0

Comments

Mauro Carvalho Chehab June 22, 2019, 1 a.m. UTC | #1
Em Fri, 21 Jun 2019 17:51:57 -0600
Jonathan Corbet <corbet@lwn.net> escreveu:

> Rather than fill our text files with :c:func:`function()` syntax, just do

> the markup via a hook into the sphinx build process.



Didn't test it, but it sounds a way nicer than the past version!

> Signed-off-by: Jonathan Corbet <corbet@lwn.net>

> ---

>  Documentation/conf.py              |  3 +-

>  Documentation/sphinx/automarkup.py | 80 ++++++++++++++++++++++++++++++

>  2 files changed, 82 insertions(+), 1 deletion(-)

>  create mode 100644 Documentation/sphinx/automarkup.py

> 

> diff --git a/Documentation/conf.py b/Documentation/conf.py

> index 7ace3f8852bd..a502baecbb85 100644

> --- a/Documentation/conf.py

> +++ b/Documentation/conf.py

> @@ -34,7 +34,8 @@ needs_sphinx = '1.3'

>  # Add any Sphinx extension module names here, as strings. They can be

>  # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom

>  # ones.

> -extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']

> +extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',

> +              'kfigure', 'sphinx.ext.ifconfig', 'automarkup']

>  

>  # The name of the math extension changed on Sphinx 1.4

>  if (major == 1 and minor > 3) or (major > 1):

> diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py

> new file mode 100644

> index 000000000000..14b09b5d145e

> --- /dev/null

> +++ b/Documentation/sphinx/automarkup.py

> @@ -0,0 +1,80 @@

> +# SPDX-License-Identifier: GPL-2.0

> +# Copyright 2019 Jonathan Corbet <corbet@lwn.net>

> +#

> +# Apply kernel-specific tweaks after the initial document processing

> +# has been done.

> +#

> +from docutils import nodes

> +from sphinx import addnodes

> +import re

> +

> +#

> +# Regex nastiness.  Of course.

> +# Try to identify "function()" that's not already marked up some

> +# other way.  Sphinx doesn't like a lot of stuff right after a

> +# :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last

> +# bit tries to restrict matches to things that won't create trouble.

> +#

> +RE_function = re.compile(r'([\w_][\w\d_]+\(\))')

> +

> +#

> +# The DVB docs create references for these basic system calls, leading

> +# to lots of confusing links.  So just don't link them.

> +#

> +Skipfuncs = [ 'open', 'close', 'write' ]


and yeah, of course, if there's something weird, it has to be at
the media docs :-)

Btw, if I'm not mistaken, we do the same for ioctl.


I'm wandering if this could also handle the Documentation/* auto-replace.

The patch snipped I did (against your past version is enclosed).


diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py
index 39c8f4d5af82..60dad596c790 100644
--- a/Documentation/sphinx/automarkup.py
+++ b/Documentation/sphinx/automarkup.py
@@ -9,6 +9,7 @@
 from __future__ import print_function
 import re
 import sphinx
+#import sys		# Just for debug
 
 #
 # Regex nastiness.  Of course.
@@ -31,10 +32,26 @@ RE_literal = re.compile(r'^(\s*)(.*::\s*|\.\.\s+code-block::.*)$')
 #
 RE_whitesp = re.compile(r'^(\s*)')
 
+#
+# Get a documentation reference
+#
+RE_doc_links = re.compile(r'\bDocumentation/([\w\d\-\_\/]+)\.rst\b')
+
+#
+# Doc link false-positives
+#
+RE_false_doc_links = re.compile(r':ref:`\s*Documentation/[\w\d\-\_\/]+\.rst')
+
 def MangleFile(app, docname, text):
     ret = [ ]
     previous = ''
     literal = False
+
+    rel_dir = ''
+
+    for depth in range(0, docname.count('/')):
+        rel_dir += "../"
+
     for line in text[0].split('\n'):
         #
         # See if we might be ending a literal block, as denoted by
@@ -63,7 +80,18 @@ def MangleFile(app, docname, text):
         # Normal line - perform substitutions.
         #
         else:
-            ret.append(RE_function.sub(r'\1:c:func:`\2`\3', line))
+#            new_line = RE_function.sub(r'\1:c:func:`\2`\3', line)
+            new_line = line
+
+            if not RE_false_doc_links.search(new_line):
+                new_line = RE_doc_links.sub(r':doc:`' + rel_dir + r'\1`', new_line)
+
+ #           # Just for debug - should be removed on production
+ #           if new_line != line:
+ #               print ("===>" + new_line, file=sys.stderr)
+
+            ret.append(new_line)
+
         #
         # Might we be starting a literal block?  If so make note of
         # the fact.


Thanks,
Mauro
Jonathan Corbet June 22, 2019, 2:43 p.m. UTC | #2
On Fri, 21 Jun 2019 22:00:46 -0300
Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:

> > +#

> > +# The DVB docs create references for these basic system calls, leading

> > +# to lots of confusing links.  So just don't link them.

> > +#

> > +Skipfuncs = [ 'open', 'close', 'write' ]  

> 

> and yeah, of course, if there's something weird, it has to be at

> the media docs :-)

> 

> Btw, if I'm not mistaken, we do the same for ioctl.


So that's actually interesting.  In, for example,
Documentation/media/uapi/v4l/func-ioctl.rst, you see something that looks
like this:

> .. c:function:: int ioctl( int fd, int request, void *argp )

>     :name: v4l2-ioctl


Some digging around didn't turn up any documentation for :name:, but it
seems to prevent ioctl() from going into the list of functions that can be
cross-referenced.  I wonder if the same should be done for the others?  I
think that would be better than putting a special-case hack into the
toolchain.

> I'm wandering if this could also handle the Documentation/* auto-replace.


I think it's the obvious place for it, yes.  Let's make sure I haven't
badly broken anything with the existing change first, though :)

Thanks,

jon
Mauro Carvalho Chehab June 22, 2019, 5:46 p.m. UTC | #3
Em Sat, 22 Jun 2019 08:43:46 -0600
Jonathan Corbet <corbet@lwn.net> escreveu:

> On Fri, 21 Jun 2019 22:00:46 -0300

> Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:

> 

> > > +#

> > > +# The DVB docs create references for these basic system calls, leading

> > > +# to lots of confusing links.  So just don't link them.

> > > +#

> > > +Skipfuncs = [ 'open', 'close', 'write' ]    

> > 

> > and yeah, of course, if there's something weird, it has to be at

> > the media docs :-)

> > 

> > Btw, if I'm not mistaken, we do the same for ioctl.  

> 

> So that's actually interesting.  In, for example,

> Documentation/media/uapi/v4l/func-ioctl.rst, you see something that looks

> like this:

> 

> > .. c:function:: int ioctl( int fd, int request, void *argp )

> >     :name: v4l2-ioctl  

> 

> Some digging around didn't turn up any documentation for :name:, but it

> seems to prevent ioctl() from going into the list of functions that can be

> cross-referenced. 


It took me a while to discover this way to be able to re-define the
name of a symbol at the C domain, but I'm pretty sure I read this
somewhere at the Sphinx docs (or perhaps on some bug track or Stack
Overflow).

I don't remember exactly where I get it, but I guess it is related to
this:

	http://docutils.sourceforge.net/docs/howto/rst-roles.html

> I wonder if the same should be done for the others?


Sure.

> I think that would be better than putting a special-case hack into the

> toolchain.


Agreed. As you're doing this, if you prefer, feel free to send the
patches to media docs. Otherwise, I'll seek for some time next week.

> 

> > I'm wandering if this could also handle the Documentation/* auto-replace.  

> 

> I think it's the obvious place for it, yes.  Let's make sure I haven't

> badly broken anything with the existing change first, though :)


Yeah, sure. Just wanted to place the code at the same thread. There are
some tricks that need to be done in order to handle the relative paths.
It took me a lot more time to get it right than adding the replacing Regex :-)

(That reminds I should post a patch to one place where we have a
Documentation/Documentation typo)

Thanks,
Mauro
Jani Nikula June 24, 2019, 11:30 a.m. UTC | #4
On Fri, 21 Jun 2019, Jonathan Corbet <corbet@lwn.net> wrote:
> Rather than fill our text files with :c:func:`function()` syntax, just do

> the markup via a hook into the sphinx build process.

>

> Signed-off-by: Jonathan Corbet <corbet@lwn.net>

> ---

>  Documentation/conf.py              |  3 +-

>  Documentation/sphinx/automarkup.py | 80 ++++++++++++++++++++++++++++++

>  2 files changed, 82 insertions(+), 1 deletion(-)

>  create mode 100644 Documentation/sphinx/automarkup.py

>

> diff --git a/Documentation/conf.py b/Documentation/conf.py

> index 7ace3f8852bd..a502baecbb85 100644

> --- a/Documentation/conf.py

> +++ b/Documentation/conf.py

> @@ -34,7 +34,8 @@ needs_sphinx = '1.3'

>  # Add any Sphinx extension module names here, as strings. They can be

>  # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom

>  # ones.

> -extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']

> +extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',

> +              'kfigure', 'sphinx.ext.ifconfig', 'automarkup']

>  

>  # The name of the math extension changed on Sphinx 1.4

>  if (major == 1 and minor > 3) or (major > 1):

> diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py

> new file mode 100644

> index 000000000000..14b09b5d145e

> --- /dev/null

> +++ b/Documentation/sphinx/automarkup.py

> @@ -0,0 +1,80 @@

> +# SPDX-License-Identifier: GPL-2.0

> +# Copyright 2019 Jonathan Corbet <corbet@lwn.net>

> +#

> +# Apply kernel-specific tweaks after the initial document processing

> +# has been done.

> +#

> +from docutils import nodes

> +from sphinx import addnodes

> +import re

> +

> +#

> +# Regex nastiness.  Of course.

> +# Try to identify "function()" that's not already marked up some

> +# other way.  Sphinx doesn't like a lot of stuff right after a

> +# :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last

> +# bit tries to restrict matches to things that won't create trouble.

> +#

> +RE_function = re.compile(r'([\w_][\w\d_]+\(\))')

> +

> +#

> +# The DVB docs create references for these basic system calls, leading

> +# to lots of confusing links.  So just don't link them.

> +#

> +Skipfuncs = [ 'open', 'close', 'write' ]

> +

> +#

> +# Find all occurrences of function() and try to replace them with

> +# appropriate cross references.

> +#

> +def markup_funcs(docname, app, node):

> +    cdom = app.env.domains['c']

> +    t = node.astext()

> +    done = 0

> +    repl = [ ]

> +    for m in RE_function.finditer(t):

> +        #

> +        # Include any text prior to function() as a normal text node.

> +        #

> +        if m.start() > done:

> +            repl.append(nodes.Text(t[done:m.start()]))

> +        #

> +        # Go through the dance of getting an xref out of the C domain

> +        #

> +        target = m.group(1)[:-2]

> +        target_text = nodes.Text(target + '()')

> +        xref = None

> +        if target not in Skipfuncs:

> +            lit_text = nodes.literal(classes=['xref', 'c', 'c-func'])

> +            lit_text += target_text

> +            pxref = addnodes.pending_xref('', refdomain = 'c',

> +                                          reftype = 'function',

> +                                          reftarget = target, modname = None,

> +                                          classname = None)

> +            xref = cdom.resolve_xref(app.env, docname, app.builder,

> +                                     'function', target, pxref, lit_text)

> +        #

> +        # Toss the xref into the list if we got it; otherwise just put

> +        # the function text.

> +        #

> +        if xref:

> +            repl.append(xref)

> +        else:

> +            repl.append(target_text)

> +        done = m.end()

> +    if done < len(t):

> +        repl.append(nodes.Text(t[done:]))

> +    return repl

> +

> +def auto_markup(app, doctree, name):

> +    for para in doctree.traverse(nodes.paragraph):

> +        for node in para.traverse(nodes.Text):

> +            if not isinstance(node.parent, nodes.literal):

> +                node.parent.replace(node, markup_funcs(name, app, node))


I think overall this is a better approach than preprocessing. Thanks for
doing this!

I toyed with something like this before, and the key difference here
seems to be ignoring literal blocks. The problem seemed to be that
replacing blocks with syntax highlighting also removed the syntax
highlighting, with no way that I could find to bring it back.

Obviously it would be great to be able to add the cross references in
more places than just bulk text nodes, but this is a good start.

BR,
Jani.


> +

> +def setup(app):

> +    app.connect('doctree-resolved', auto_markup)

> +    return {

> +        'parallel_read_safe': True,

> +        'parallel_write_safe': True,

> +        }


-- 
Jani Nikula, Intel Open Source Graphics Center
Jonathan Corbet June 24, 2019, 2:25 p.m. UTC | #5
On Mon, 24 Jun 2019 14:30:47 +0300
Jani Nikula <jani.nikula@linux.intel.com> wrote:

> > +def auto_markup(app, doctree, name):

> > +    for para in doctree.traverse(nodes.paragraph):

> > +        for node in para.traverse(nodes.Text):

> > +            if not isinstance(node.parent, nodes.literal):

> > +                node.parent.replace(node, markup_funcs(name, app, node))  

> 

> I think overall this is a better approach than preprocessing. Thanks for

> doing this!

> 

> I toyed with something like this before, and the key difference here

> seems to be ignoring literal blocks. The problem seemed to be that

> replacing blocks with syntax highlighting also removed the syntax

> highlighting, with no way that I could find to bring it back.


That test could use a comment, really.  What it is actually doing is
skipping text chunks in ``inline literal`` sections, and what that is
*actually* doing is avoiding marking up functions that have an
explicit :c:func: markup on them already.

Someday I don't doubt that this loop will be replaced by a proper tree
walk that knows where to prune things and how to replace various other
types of nodes, but this is easy and does the right thing pretty much
everywhere as far as I can tell.

Thanks,

jon
Jonathan Corbet June 24, 2019, 2:29 p.m. UTC | #6
On Sat, 22 Jun 2019 14:46:10 -0300
Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:

> > > .. c:function:: int ioctl( int fd, int request, void *argp )

> > >     :name: v4l2-ioctl    

> > 

> > Some digging around didn't turn up any documentation for :name:, but it

> > seems to prevent ioctl() from going into the list of functions that can be

> > cross-referenced.   

> 

> It took me a while to discover this way to be able to re-define the

> name of a symbol at the C domain, but I'm pretty sure I read this

> somewhere at the Sphinx docs (or perhaps on some bug track or Stack

> Overflow).

> 

> I don't remember exactly where I get it, but I guess it is related to

> this:

> 

> 	http://docutils.sourceforge.net/docs/howto/rst-roles.html

> 

> > I wonder if the same should be done for the others?  

> 

> Sure.


It actually occurs to me that it might be better to keep the skip list and
maybe expand it.  There are vast numbers of places where people write
open() or whatever(), and there is no point in even trying to
cross-reference them.  I should do some tests, it might even make a
measurable difference in the build time to skip them :)  And in any case,
somebody is bound to put one of those common names into the namespace in
the future, recreating the current problem.

jon
Mauro Carvalho Chehab June 24, 2019, 4:38 p.m. UTC | #7
Em Mon, 24 Jun 2019 08:29:50 -0600
Jonathan Corbet <corbet@lwn.net> escreveu:

> On Sat, 22 Jun 2019 14:46:10 -0300

> Mauro Carvalho Chehab <mchehab+samsung@kernel.org> wrote:

> 

> > > > .. c:function:: int ioctl( int fd, int request, void *argp )

> > > >     :name: v4l2-ioctl      

> > > 

> > > Some digging around didn't turn up any documentation for :name:, but it

> > > seems to prevent ioctl() from going into the list of functions that can be

> > > cross-referenced.     

> > 

> > It took me a while to discover this way to be able to re-define the

> > name of a symbol at the C domain, but I'm pretty sure I read this

> > somewhere at the Sphinx docs (or perhaps on some bug track or Stack

> > Overflow).

> > 

> > I don't remember exactly where I get it, but I guess it is related to

> > this:

> > 

> > 	http://docutils.sourceforge.net/docs/howto/rst-roles.html

> >   

> > > I wonder if the same should be done for the others?    

> > 

> > Sure.  

> 

> It actually occurs to me that it might be better to keep the skip list and

> maybe expand it.  There are vast numbers of places where people write

> open() or whatever(), and there is no point in even trying to

> cross-reference them. 


Yeah, subsystems will likely have their own "man pages" for for sysctls.

Both the dvb and the v4l2 ones are part of what used to be the DocBook
manpages for those syscalls. If I'm not mistaken, we ended by expanded 
the same approach for the media controller, for CEC and for RC.

> I should do some tests, it might even make a

> measurable difference in the build time to skip them :)  And in any case,

> somebody is bound to put one of those common names into the namespace in

> the future, recreating the current problem.


There is one way of keeping it while avoiding troubles: you could
create internal names, for example using the current dir, auto-adding
the ":name:" tag when a declaration conflict rises. Not sure how
easy/hard would be to implement it.

Btw, at get_abi.pl, I had to do a trick like that, as some symbols
have a "local context". The good thing at ABI files is that the context
is clear: it is valid only between "What:" and "Description:" fields.

Also, as it is a single script that parses the entire ABI (it takes
on ~0.1 seconds to parse everything and store internally on my machine),
the logic there detects when an existing symbol is re-used on a different
context. When this happens, it adds a random char at the end of the
internal reference, while keeping the original name[1].

[1] on the the highly unlikely event that the new name still repeats,
it adds a new random char - until the name gets different.

Probably doing it at automarkup won't be that simple, but it could
work.

Thanks,
Mauro
diff mbox series

Patch

diff --git a/Documentation/conf.py b/Documentation/conf.py
index 7ace3f8852bd..a502baecbb85 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -34,7 +34,8 @@  needs_sphinx = '1.3'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']
+extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain',
+              'kfigure', 'sphinx.ext.ifconfig', 'automarkup']
 
 # The name of the math extension changed on Sphinx 1.4
 if (major == 1 and minor > 3) or (major > 1):
diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py
new file mode 100644
index 000000000000..14b09b5d145e
--- /dev/null
+++ b/Documentation/sphinx/automarkup.py
@@ -0,0 +1,80 @@ 
+# SPDX-License-Identifier: GPL-2.0
+# Copyright 2019 Jonathan Corbet <corbet@lwn.net>
+#
+# Apply kernel-specific tweaks after the initial document processing
+# has been done.
+#
+from docutils import nodes
+from sphinx import addnodes
+import re
+
+#
+# Regex nastiness.  Of course.
+# Try to identify "function()" that's not already marked up some
+# other way.  Sphinx doesn't like a lot of stuff right after a
+# :c:func: block (i.e. ":c:func:`mmap()`s" flakes out), so the last
+# bit tries to restrict matches to things that won't create trouble.
+#
+RE_function = re.compile(r'([\w_][\w\d_]+\(\))')
+
+#
+# The DVB docs create references for these basic system calls, leading
+# to lots of confusing links.  So just don't link them.
+#
+Skipfuncs = [ 'open', 'close', 'write' ]
+
+#
+# Find all occurrences of function() and try to replace them with
+# appropriate cross references.
+#
+def markup_funcs(docname, app, node):
+    cdom = app.env.domains['c']
+    t = node.astext()
+    done = 0
+    repl = [ ]
+    for m in RE_function.finditer(t):
+        #
+        # Include any text prior to function() as a normal text node.
+        #
+        if m.start() > done:
+            repl.append(nodes.Text(t[done:m.start()]))
+        #
+        # Go through the dance of getting an xref out of the C domain
+        #
+        target = m.group(1)[:-2]
+        target_text = nodes.Text(target + '()')
+        xref = None
+        if target not in Skipfuncs:
+            lit_text = nodes.literal(classes=['xref', 'c', 'c-func'])
+            lit_text += target_text
+            pxref = addnodes.pending_xref('', refdomain = 'c',
+                                          reftype = 'function',
+                                          reftarget = target, modname = None,
+                                          classname = None)
+            xref = cdom.resolve_xref(app.env, docname, app.builder,
+                                     'function', target, pxref, lit_text)
+        #
+        # Toss the xref into the list if we got it; otherwise just put
+        # the function text.
+        #
+        if xref:
+            repl.append(xref)
+        else:
+            repl.append(target_text)
+        done = m.end()
+    if done < len(t):
+        repl.append(nodes.Text(t[done:]))
+    return repl
+
+def auto_markup(app, doctree, name):
+    for para in doctree.traverse(nodes.paragraph):
+        for node in para.traverse(nodes.Text):
+            if not isinstance(node.parent, nodes.literal):
+                node.parent.replace(node, markup_funcs(name, app, node))
+
+def setup(app):
+    app.connect('doctree-resolved', auto_markup)
+    return {
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+        }