[luatex] asking for style/code critique

Werner LEMBERG wl at gnu.org
Sun Dec 18 20:58:28 CET 2022


Folks,


attached is my first serious attempt to write a 'hyphenate' callback
(to be used for LilyPond's Texinfo documentation).  It is actually my
first Lua program ever.  Fortunately, initial tests indicate that it
works as expected :-)

Please have a quick look and check whether you can see any big
problems – mainly stylewise (besides using two spaces for indentation
instead of three) but perhaps codewise, too.

Thanks in advance.


    Werner
-------------- next part --------------
-- code.lua
--
-- Written 2022 by Werner Lemberg <wl at gnu.org>.
--
-- If the LilyPond documentation is compiled with LuaTeX (which is the
-- default), Texinfo's `@code` macro gets enhanced by this Lua code to
-- do the following things.
--
-- (0) For further analysis, break the argument to `@code` into words.
--     Words that already contain discretionaries and penalties (for
--     example, by inserting `@/` or `@-`) are ignored.
-- (1) Insert hyphenation points in camel-case words.
-- (2) Add possible line breaks after `-` and `_`.
-- (3) Avoid that single-character words at the start or the end of
--     the `@code` argument are positioned at the end or start of an
--     output line, respectively.  A typical case with LilyPond code
--     is `@code{@{ ... @}}`.
--
-- There will be at least two characters at the start or the end of a
-- word before inserting a hyphenation point or a possible line break.
--
-- Note that this code only works if `@allowcodebreaks false` is set.


-- Shorthands and constants used by `typography` below.
--
-- NOTE(review): all of these are assigned as globals.  If nothing
-- outside this file reads them, declaring them `local` would avoid
-- polluting the global environment -- confirm before changing.

-- Code point of the ASCII hyphen-minus (`-`); used as the character
-- of the pre-break glyph in inserted discretionaries.
char_hyphen = 0x2D

-- UTF-8 aware pattern matching and code point to string conversion.
umatch, uchar = unicode.utf8.match, unicode.utf8.char

-- Numeric ids of the node types we have to recognize.
DISC, GLUE, GLYPH, HLIST, PENALTY =
  node.id("disc"),
  node.id("glue"),
  node.id("glyph"),
  node.id("hlist"),
  node.id("penalty")


-- Number of the attribute that marks nodes belonging to `@code`.  This
-- value must be the same as set in `common-macros.itexi` (using
-- `\attribute`).
code_attribute = 200


-- Return an array holding all nodes from `first` up to and including
-- `last`, following the `next` links.
local function nodes_between(first, last)
  local nodes = {}
  local n = first

  while n do
    nodes[#nodes + 1] = n
    if n == last then
      break
    end
    n = n.next
  end

  return nodes
end


-- Scan the node list starting at `head` and return an array of words
-- found in `@code` (i.e., in nodes that have `code_attribute` set).
--
-- A word is a run of consecutive glyph nodes typeset with the same
-- font.  Words that already contain discretionaries or penalties are
-- not returned.  Every entry is a table with the fields `first` (first
-- node), `last` (last node), and `len` (number of nodes).
local function collect_code_words(head)
  local words = {}
  local word_start = nil
  local word_end = nil
  local word_len = 0
  local prev_font = -1
  local only_characters = true

  -- Store the word collected so far (if any), ignoring words that
  -- already contain discretionaries or penalties.
  local function flush()
    if word_len > 0 and only_characters then
      words[#words + 1] = { first = word_start,
                            last = word_end,
                            len = word_len }
    end
    only_characters = true
  end

  for n in node.traverse(head) do
    local in_code = node.has_attribute(n, code_attribute)
    local in_word = false

    -- Handle only stuff in `@code`.
    if in_code then
      -- Only characters typeset with same font are considered to be
      -- in the same word.
      if n.id == GLYPH and (prev_font == -1 or prev_font == n.font) then
        in_word = true
      end

      -- We don't handle discretionaries and penalties outside of a
      -- word.
      if (n.id == DISC or n.id == PENALTY) and word_len > 0 then
        in_word = true
        only_characters = false
      end
    end

    if in_word then
      if word_len == 0 then
        word_start = n
      end

      word_end = n
      word_len = word_len + 1

      if n.id == GLYPH then
        prev_font = n.font
      end
    else
      flush()

      -- The current node might still start a new word (this happens
      -- if a glyph uses a font different from the previous one).
      if in_code and n.id == GLYPH then
        word_start = n
        word_end = n
        word_len = 1
        prev_font = n.font
      else
        word_len = 0
        prev_font = -1
      end
    end
  end

  -- A word that extends to the very end of the node list has no
  -- following node that would trigger storing it in the loop above.
  flush()

  return words
end


-- Check for `[<letter>_] [<letter>_] [-_] <letter> [<letter>_]` in
-- the word of `word_len` glyphs starting at `word_start` and insert a
-- penalty after `-` (or `_`) if we have a hit.
--
-- The word must consist of glyph nodes only.
local function add_breaks_after_separators(head, word_start, word_len)
  local start = word_start
  local len = word_len

  while len >= 5 do
    local c1 = start
    local c2 = c1.next
    local c3 = c2.next
    local c4 = c3.next
    local c5 = c4.next

    if umatch(uchar(c1.char), "[%l%u_]")
       and umatch(uchar(c2.char), "[%l%u_]")
       and umatch(uchar(c3.char), "[_-]")
       and umatch(uchar(c4.char), "[%l%u]")
       and umatch(uchar(c5.char), "[%l%u_]") then
      local pen = node.new(PENALTY)
      pen.penalty = tex.hyphenpenalty
      node.set_attribute(pen, code_attribute, 1)
      node.insert_after(head, c3, pen)

      -- `c4` was fetched before the insertion, so the nodes we still
      -- have to look at are glyphs only and counting stays correct.
      len = len - 3
      start = c4
    else
      len = len - 1
      start = c2
    end
  end
end


-- Check for `<letter> <lowercase> <uppercase> <letter>` in the word
-- delimited by `word_start` and `word_end` and insert the equivalent
-- to `\discretionary{-}{}{}` after `<lowercase>` if we have a hit.
local function add_camel_case_discretionaries(head, word_start, word_end)
  -- `add_breaks_after_separators` might have inserted penalty nodes
  -- into the word.  We thus walk over the real nodes between
  -- `word_start` and `word_end` instead of counting glyphs; otherwise
  -- the last characters of such a word would never be inspected.
  local nodes = nodes_between(word_start, word_end)
  local i = 1

  while i + 3 <= #nodes do
    local c1, c2, c3, c4 = nodes[i], nodes[i + 1], nodes[i + 2], nodes[i + 3]

    -- Because of the possibly inserted penalty nodes we have to
    -- additionally check `c1` to `c4` for validness.
    if c1.id == GLYPH
       and c2.id == GLYPH
       and c3.id == GLYPH
       and c4.id == GLYPH
       and umatch(uchar(c1.char), "[%l%u]")
       and umatch(uchar(c2.char), "%l")
       and umatch(uchar(c3.char), "%u")
       and umatch(uchar(c4.char), "[%l%u]") then
      local hyphen = node.new(GLYPH)
      hyphen.subtype = 1
      hyphen.font = c2.font
      hyphen.char = char_hyphen

      local disc = node.new(DISC)
      disc.subtype = 1
      disc.penalty = tex.hyphenpenalty
      disc.pre = hyphen
      node.set_attribute(disc, code_attribute, 1)
      node.insert_after(head, c2, disc)

      -- Continue with `c3`.
      i = i + 2
    else
      i = i + 1
    end
  end
end


-- Avoid that a single character at the start or the end of `@code`
-- gets separated from the rest by a line break.
local function protect_single_characters(head)
  for n in node.traverse(head) do
    -- Check whether there is a single character at the beginning of
    -- `@code`, followed by a space.  If we have a hit, insert a
    -- penalty after the character.
    if not node.has_attribute(n, code_attribute) then
      local char = n.next
      if char
         and node.has_attribute(char, code_attribute)
         and char.id == GLYPH then
        local space = char.next
        if space
           and node.has_attribute(space, code_attribute)
           and space.id == GLUE then
          local pen = node.new(PENALTY)
          pen.penalty = 10000
          node.set_attribute(pen, code_attribute, 1)
          node.insert_after(head, char, pen)
        end
      end
    end

    -- Check whether there is a single character at the end of
    -- `@code`, preceded by a space.  If we have a hit, insert a
    -- penalty before the space.
    if node.has_attribute(n, code_attribute) and n.id == GLUE then
      local char = n.next
      if char
         and node.has_attribute(char, code_attribute)
         and char.id == GLYPH then
        -- We actually have to check for one more node because `@code`
        -- ends with a call to `\null`, which creates an empty hbox.
        local hbox = char.next
        if hbox
           and node.has_attribute(hbox, code_attribute)
           and hbox.id == HLIST then
          local non_code = hbox.next
          if non_code
             and not node.has_attribute(non_code, code_attribute) then
            local pen = node.new(PENALTY)
            pen.penalty = 10000
            node.set_attribute(pen, code_attribute, 1)
            node.insert_before(head, n, pen)
          end
        end
      end
    end
  end
end


-- Improve line breaking of `@code` material in the node list starting
-- at `head`.  The list is modified in place; nothing is returned.
--
-- (1) Collect all words in `@code`.
-- (2) Add possible line breaks after `-` and `_`.
-- (3) Insert hyphenation points in camel-case words.
-- (4) Keep single-character words at the start or the end of `@code`
--     together with their neighbours.
typography = function(head)
  for _, word in ipairs(collect_code_words(head)) do
    add_breaks_after_separators(head, word.first, word.len)
    add_camel_case_discretionaries(head, word.first, word.last)
  end

  protect_single_characters(head)
end

-- eof


More information about the luatex mailing list.