Date: February 17, 2025

Writing a syntax highlighter

I have my own tool to convert Markdown text to plain HTML blog posts,
which I am very proud of. The only thing it lacked for my use case was a
syntax highlighter for the code that I share, which I have been planning to implement for months.

Implementation

The current implementation does a simple regex replace of each keyword, making use of the <font> tag in HTML.
I have only added C support for now, since it's the only language I have touched for months.
There are some flaws with this implementation, which you will be able to see below; they can be fixed by
handling edge cases (a lot of edge cases) and by implementing a tiny lexer.
Code and example 😀:


/**
 * C programming language
 *
 * Each entry pairs a keyword string with the colour that is placed in the
 * color attribute of the <font> tag wrapped around it.
 *
 * The highlighter does a plain prefix comparison against the keyword
 * string and stops at the first entry that matches, so:
 *   - entries that end in a space only match when the keyword is
 *     followed by a space in the source text;
 *   - a string must appear only once in the table, because any later
 *     duplicate can never be reached.
 **/
static struct keyword c_keywords[] = {
  /* Preprocessor directives */
  { "#include",   "#E91E63" },
  { "#define ",   "#E91E63" },
  /* Control flow */
  { "for ",       "#D84315" },
  { "while ",     "#D84315" },
  { "do ",        "#D84315" },
  { "break",      "#D84315" },
  { "if ",        "#D84315" },
  { "else ",      "#D84315" },
  { "switch ",    "#D84315" },
  { "continue",   "#D84315" },
  { "return ",    "#D84315" },
  /* Basic types and type modifiers */
  { "int ",       "#6A1B9A" },
  { "char ",      "#6A1B9A" },
  { "float ",     "#6A1B9A" },
  { "double ",    "#6A1B9A" },
  { "long ",      "#6A1B9A" },
  { "short ",     "#6A1B9A" },
  { "unsigned ",  "#6A1B9A" },
  { "signed ",    "#6A1B9A" },
  { "void ",      "#6A1B9A" },
  /* Aggregate and enumeration types */
  { "struct ",    "#1565C0" },
  { "union ",     "#1565C0" },
  { "enum ",      "#1565C0" },
  /* Miscellaneous */
  { "sizeof",     "#D84315" },
  { "typedef ",   "#D84315" },
};


/**
 * Append the first N bytes of SRC to the growable buffer *BUF.
 *
 * *CAP is the allocated size of *BUF and *LEN the number of bytes in use.
 * The buffer is grown (by doubling) until it can hold the appended data
 * plus one extra byte, so the caller can always NUL-terminate at
 * (*BUF)[*LEN] afterwards.
 *
 * Returns 0 on success and -1 if the buffer could not be grown; on
 * failure *BUF, *CAP and *LEN are left untouched and *BUF is still owned
 * by the caller.
 **/
static int
hl_append (char       **buf,
           size_t      *cap,
           size_t      *len,
           const char  *src,
           size_t       n)
{
  size_t needed = *len + n + 1; /* +1 keeps room for the terminator */

  if (needed > *cap)
    {
      size_t new_cap = *cap;
      char *tmp = NULL;

      while (new_cap < needed)
        {
          /* Doubling would wrap around size_t */
          if (new_cap > (size_t) -1 / 2)
            return -1;

          new_cap *= 2;
        }

      /* Keep the old pointer until realloc is known to have succeeded */
      tmp = realloc (*buf, new_cap);
      if (tmp == NULL)
        return -1;

      *buf = tmp;
      *cap = new_cap;
    }

  memcpy (*buf + *len, src, n);
  *len += n;

  return 0;
}


/**
 * Build a copy of CODEBLK in which every occurrence of a keyword is
 * wrapped as <font color="COLOR">KEYWORD</font>.
 *
 * codeblk:    NUL-terminated text to highlight.
 * keywords:   table of keyword/colour pairs; searched in order, the
 *             first entry whose string is a prefix of the remaining
 *             input wins.
 * n_keywords: number of entries in KEYWORDS.
 *
 * Whitespace is copied through unchanged.  Any other character that is
 * not the start of a keyword is looked up with get_char_index (); when
 * that yields an index other than N_CHARS, chars[index].str is emitted
 * in place of the character (presumably an HTML-safe replacement; the
 * table is defined elsewhere), otherwise the character is copied as is.
 *
 * NOTE: matching is a plain prefix comparison at every position, so a
 * keyword is also recognised inside a longer identifier.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free (), or NULL if CODEBLK is NULL or memory could not be allocated.
 **/
static char *
highlight_keywords (char           *codeblk,
                    struct keyword  keywords[],
                    int             n_keywords)
{
  size_t size = 1000;
  size_t count = 0;
  char *highlighted = NULL;
  const char *ptr = codeblk;

  if (codeblk == NULL)
    return NULL;

  highlighted = malloc (size);
  if (highlighted == NULL)
    return NULL;

  while (*ptr != '\0')
    {
      int status = 0;
      int i = 0;

      /* <ctype.h> functions need a value representable as unsigned char */
      if (isspace ((unsigned char) *ptr))
        {
          status = hl_append (&highlighted, &size, &count, ptr, 1);
          ptr++;
        }
      else
        {
          for (i = 0; i < n_keywords; i++)
            {
              const char *keyword = keywords[i].str;
              size_t keyword_len = strlen (keyword);

              /* An empty keyword would match everywhere and never
                 advance the input, so it is skipped. */
              if (keyword_len > 0
                  && strncmp (ptr, keyword, keyword_len) == 0)
                break;
            }

          if (i < n_keywords)
            {
              const char *parts[5] = {
                "<font color=\"", NULL, "\">",
                NULL,
                "</font>"
              };

              parts[1] = keywords[i].color;
              parts[3] = keywords[i].str;

              for (int part = 0; part < 5 && status == 0; part++)
                status = hl_append (&highlighted, &size, &count,
                                    parts[part], strlen (parts[part]));

              ptr += strlen (keywords[i].str);
            }
          else /* Not a keyword */
            {
              unsigned int char_idx;

              char_idx = get_char_index (*ptr);

              if (char_idx != N_CHARS)
                {
                  const char *replacement = chars[char_idx].str;

                  status = hl_append (&highlighted, &size, &count,
                                      replacement, strlen (replacement));
                }
              else
                {
                  status = hl_append (&highlighted, &size, &count, ptr, 1);
                }

              ptr++;
            }
        }

      if (status != 0)
        {
          free (highlighted);
          return NULL;
        }
    }

  /* hl_append always leaves one spare byte for the terminator */
  highlighted[count] = '\0';

  return highlighted;
}


TODO

In short: implement a tiny lexer.