Writing a syntax highlighter - part 2
Continuation to my previous blog, The syntax highlighter
previously I wrote was quite primitive and ugly. Writing a more sophisticated
version of it made me realize that syntax highlighters aren't just re.replace()
python scripts, there's much more in them.
For testing the highlighter, I used the highlighter C code itself to ensure
it's working as intended (Kind of self-hosting, in a way).
It supports very small subset of C and will eventually grow once I start to using it more.
The good thing about latest approach is, I can re-use all the data structures that I have
created for the C syntax highlighter in other programming languages too!
You can find latest code here: md2html/src/syntax.c and here:
static struct keywords_set *c_keywords[4] = {
[0] = & (struct keywords_set) {
(struct keyword[]) {
{ "#include", "#E91E63" },
{ "#define ", "#E91E63" },
{ NULL, NULL }
}
},
[1] = & (struct keywords_set) {
(struct keyword[]) {
{ "for", "#D84315" },
{ "while", "#D84315" },
{ "do", "#D84315" },
{ "break", "#D84315" },
{ "if", "#D84315" },
{ "else", "#D84315" },
{ "switch", "#D84315" },
{ "continue", "#D84315" },
{ "return", "#D84315" },
{ "case", "#D84315" },
{ "default", "#D84315" },
{ NULL, NULL }
}
},
[2] = & (struct keywords_set) {
(struct keyword[]) {
{ "int", "#0000bb" },
{ "char", "#0000bb" },
{ "float", "#0000bb" },
{ "double", "#0000bb" },
{ "long", "#0000bb" },
{ "short", "#0000bb" },
{ "unsigned", "#0000bb" },
{ "bool", "#0000bb" },
{ "signed", "#0000bb" },
{ "static", "#0000bb" },
{ "const", "#0000bb" },
{ "struct", "#0000bb" },
{ "void", "#0000bb" },
{ "size_t", "#0000bb" },
{ NULL, NULL }
}
},
[3] = & (struct keywords_set) {
(struct keyword[]) {
{ "union ", "#0000bb" },
{ "enum", "#0000bb" },
{ "sizeof", "#D84315" },
{ "typedef", "#D84315" },
{ "enum", "#D84315" },
{ "true", "#6A1B9A" },
{ "false", "#6A1B9A" },
{ "NULL", "#6A1B9A" },
{ NULL, NULL }
}
}
};
static bool
__isalnum (char c)
{
if (c == '_')
return true;
return isalnum (c);
}
static bool
isescape_sequence (char *ptr)
{
if (* (ptr) != '\\')
return false;
else if (* (ptr - 1) == '\\')
return false;
return true;
}
static size_t
get_number_length (char *str)
{
/* we have already processed first character */
size_t size = 1;
str++;
/* FIXME: handle binary and hexadecimal numbers */
while (*str != '\0')
{
if (!NUMBER_TOKEN (*str))
break;
str++;
size++;
}
return size;
}
static size_t
extract_number (char *str,
char *buf)
{
size_t size;
size = get_number_length (str);
if (size != 0)
{
strncpy (buf, str, size);
}
return size;
}
static size_t
extract_text (char *start,
char *buf,
size_t buf_len,
char *pattern)
{
size_t size = 0;
char *str_start, *needle;
needle = start;
do {
needle = strstr (needle + 1, pattern);
/* skip escape sequences */
if (needle && !isescape_sequence (needle - 1))
break;
} while (needle != NULL);
if (needle != NULL)
{
size = needle - start + 1;
xml_sanitize_strcpy (buf, start, size);
}
return size;
}
static char *
highlight_keywords (char *codeblk,
struct keywords_set **set,
int n_keyword_types)
{
int size = 1000;
int count = 0;
char *highlighted = NULL;
char *ptr = NULL;
struct keyword string;
highlighted = malloc (sizeof (char) * size);
ptr = codeblk;
while (*ptr != '\0')
{
int i;
struct keyword *match = NULL;
char buf[300] = { 0 }; /* FIXME */
bool advance_ptr = true;
/* This is a bit risky;
* implement better logic or add a wrapper over strcpy */
if (count == size - 200)
{
size <<= 1;
highlighted = realloc (highlighted, size);
}
if (STRING_CHAR_TOKEN (*ptr))
{
size_t size;
char *pattern;
if (STRING_TOKEN (*ptr))
{
pattern = "\"";
}
else
{
pattern = "\'";
}
size = extract_text (ptr, buf, sizeof (buf), pattern);
if (size != 0)
{
string.str = buf;
string.color = "#6A1B9A";
match = &string;
ptr += size;
advance_ptr = false;
}
}
else if (COMMENT_TOKEN (ptr))
{
size_t size;
size = extract_text (ptr, buf, sizeof (buf), "*/");
if (size != 0)
{
string.str = buf;
string.color = "#006400";
match = &string;
ptr += size;
advance_ptr = false;
}
}
else if (NUMBER_TOKEN (*ptr))
{
size_t size;
size = extract_number (ptr, buf);
if (size != 0)
{
string.str = buf;
string.color = "#006400";
match = &string;
ptr += size;
advance_ptr = false;
}
}
else if (isspace (*ptr))
{
highlighted[count++] = *ptr++;
continue;
}
for (i = 0; i < n_keyword_types; i++)
{
struct keywords_set *type_set;
struct keyword *type;
type_set = set[i];
type = set[i]->keywords;
for (int j = 0; type[j].str != NULL; j++)
{
const char *keyword;
keyword = type[j].str;
if (strncmp (ptr, keyword, strlen (keyword)) == 0)
{
if (!__isalnum (* (ptr - 1))
&& !__isalnum (* (ptr + strlen (keyword))))
{
match = &type[j];
break;
}
}
}
}
if (match != NULL)
{
char *cpy, *org;
const char *strs[5] = {
"<font color=\"", NULL, "\">",
NULL,
"</font>"
};
strs[1] = match->color;
strs[3] = match->str;
cpy = &highlighted[count];
org = cpy;
for (int i = 0; i < 5; i++)
{
strcpy (cpy, strs[i]);
cpy += strlen (strs[i]);
count += strlen (strs[i]);
}
if (advance_ptr)
ptr += strlen (match->str);
}
else /* Not a keyword */
{
const char *str;
str = xml_char_replace (*ptr);
if (str != NULL)
{
char *cpy = NULL;
cpy = &highlighted[count];
strcpy (cpy, str);
count += strlen (str);
ptr++;
}
else
{
highlighted[count++] = *ptr++;
}
}
}
highlighted[count] = '\0';
return highlighted;
}
char *
syntax_highlight (char *codeblk,
Lang lang)
{
char *highlighted = NULL;
int n_types;
switch (lang)
{
case LANG_C:
n_types = ARRAY_SIZE (c_keywords);
highlighted = highlight_keywords (codeblk,
c_keywords, n_types);
break;
default:
highlighted = NULL;
}
return highlighted;
}