Newer
Older
/*
** lexer.c for 42sh
**
** Made by Seblu
** Login <seblu@epita.fr>
**
** Started on Sun Jul 30 04:36:53 2006 Seblu
*/
/*
** Token recognition rationale:
** - Separators are characters that delimit tokens and are discarded.
** - Operators are characters that delimit tokens and are themselves returned as tokens.
** - Quotes are sequences that must be returned in their entirety.
*/
/*
** ============
** DECLARATIONS
** ============
*/
// Order is very important for correct recognition !
{TOK_NEWLINE, "\n", 1},
{TOK_AND, "&&", 2},
{TOK_SEPAND, "&", 1},
{TOK_OR, "||", 2},
{TOK_PIPE, "|", 1},
{TOK_DSEMI, ";;", 2},
{TOK_SEP, ";", 1},
{TOK_DLESSDASH, "<<-", 3},
{TOK_DLESS, "<<", 2},
{TOK_LESSGREAT, "<>", 2},
{TOK_LESSAND, "<&", 2},
{TOK_LESS, "<", 1},
{TOK_DGREAT, ">>", 2},
{TOK_GREATAND, ">&", 2},
{TOK_CLOBBER, ">|", 2},
{TOK_GREAT, ">", 1},
{TOK_NONE, NULL, 0}
/*!
** Describes one quoting construct known to the lexer: the literal that
** opens it, the literal that closes it, and the length of each literal.
*/
typedef struct s_quote
{
  const char   *start;    /* opening sequence */
  const size_t lenstart;  /* length of start, in chars */
  const char   *stop;     /* closing sequence */
  const size_t lenstop;   /* length of stop, in chars */
} ts_quote;

/*
** Table of quoting constructs, terminated by an entry whose start is NULL.
**
** Order matters: is_quote_start() returns the FIRST entry whose start
** sequence matches, so an opener that has another opener as a prefix must
** be listed before it ("$((" before "$("). Each length must equal the
** length of its literal, otherwise only a prefix of it is compared.
*/
static const ts_quote quotes[] =
{
  {"\"", 1, "\"", 1},
  {"'", 1, "'", 1},
  {"`", 1, "`", 1},
  {"${", 2, "}", 1},
  {"$((", 3, "))", 2},
  {"$(", 2, ")", 1},
  {NULL, 0, NULL, 0},
};
/*!
** Check whether buf + *buf_pos points to the start of a quote sequence.
** @warning Recognition starts at buf + *buf_pos!
**
** @param buf a string buffer
** @param buf_pos position in the buffer; if a quote is found, it is advanced
** to point to the last char of the opening sequence
** @param quote receives the quote type found
**
** @return true (!0) if a quote is found, else false (0)
*/
static int is_quote_start(const char *buf, size_t *buf_pos, const ts_quote **quote);
/*!
** Check whether buf + *buf_pos points to the end of a quote sequence.
** @warning Recognition starts at buf + *buf_pos!
**
** @param buf a string buffer
** @param buf_pos position in the buffer; if the end is found, it is advanced
** to point to the last char of the closing sequence
** @param quote quote type whose end is searched for
**
** @return true (!0) if the end of the quote is found, else false (0)
*/
static int is_quote_stop(const char *buf, size_t *buf_pos, const ts_quote *quote);
/*!
** Tell whether c is a separator.
**
** @param c Must be a character
**
** @return true if c is a separator
*/
/*!
** Check whether the buffer points to an operator. If it does and buf_pos is
** not NULL, *buf_pos is advanced past the operator to point to the next token.
** @warning Recognition starts at buf!
**
** @param buf a string buffer where recognition starts
** @param buf_pos buffer position to advance if an operator is found
** @param token receives the recognized operator token
**
** @return true (!0) if found, else false (0)
*/
static int is_operator(const char *buf, size_t *buf_pos, s_token *token);
/*!
** Read the lexer's stream and return the next token.
*/
/*!
** This function is only called when the end of a line occurs inside
** a quote or after a backslash.
**
** @param lexer lexer struct
**
** @return 1 if a line could be read, 0 on eof
*/
/*!
** Correctly set a token. First, the memory of the previous string is
** freed if the token is a word.
**
** @param token token to set
** @param id new token id
** @param s new token string
*/
static void token_set(s_token *token, e_tokenid id, const char *s);
/*
** ===========
** DEFINITIONS
** ===========
*/
secmalloc(new, sizeof (s_lexer));
new->stream = getln_open(fd);
new->buf = NULL;
new->buf_size = new->buf_pos = 0;
new->token.id = TOK_NONE;
new->token.str = NULL;
new->token.len = 0;
new->eof = 0;
buf = lexer->token;
lexer->token.id = TOK_NONE;
lexer->token.str = NULL;
return buf;
/*!
** Read the body of a here-document from the lexer's stream.
**
** Lines are fetched with getln() and appended with strmerge() until a line
** compares equal to @a delim (that line is appended too, since the merge
** happens before the comparison) or until getln() returns NULL, in which
** case lexer->eof is set.
**
** @param lexer lexer struct whose stream is read
** @param delim line that terminates the here-document
**
** @return a TOK_EOF token if the lexer was already at eof, otherwise a
** TOK_WORD token holding the accumulated text (NULL string if nothing
** could be read)
*/
s_token lexer_getheredoc(s_lexer *lexer, const char *delim)
{
  s_token token;
  char *buf = NULL;
  char *line;

  // token_set() reads the previous id (and may free the previous string),
  // so the local token must start in a well-defined empty state.
  token.id = TOK_NONE;
  token.str = NULL;
  token.len = 0;
  if (lexer->eof) {
    token_set(&token, TOK_EOF, "EOF");
    return token;
  }
  show_prompt(PROMPT_PS2);
  do {
    line = getln(lexer->stream);
    if (line == NULL) {
      lexer->eof = 1;
      break;
    }
    // NOTE(review): the previous buf and line are not released here; if
    // strmerge() returns a fresh allocation this leaks -- confirm ownership.
    buf = strmerge(2, buf, line);
  }
  while (strcmp(line, delim));
  token_set(&token, TOK_WORD, buf);
  return token;
}
/*
** Overwrite a token with a new id and string.
** If the token currently is a TOK_WORD, its string is freed first; the
** new string pointer is stored as-is (not copied) and its length cached
** (0 when s is NULL).
*/
static void token_set(s_token *token, e_tokenid id, const char *s)
{
  // only word tokens have their string freed here
  if (token->id == TOK_WORD) {
    free((char *) token->str);
  }
  token->id = id;
  token->str = s;
  token->len = (s != NULL) ? strlen(s) : 0;
}
if (lexer->buf && lexer->buf_pos == lexer->buf_size) {
free(lexer->buf);
lexer->buf = NULL;
lexer->buf_size = 0;
}
//read a line if buf is empty
if (!lexer->buf_size && !lexer->eof && (lexer->buf = getln(lexer->stream))) {
lexer->eof = 1;
token_set(&lexer->token, TOK_EOF, "EOF");
return;
//cut a slice of stream
while (!lexer_cut(lexer))
;; //retry again
buf = lexer->buf;
if (lexer->buf_size > 0 && buf[lexer->buf_size - 1] == '\n')
buf[lexer->buf_size - 1] = 0;
if (lexer->buf_size > 1 && buf[lexer->buf_size - 2] == '\\')
buf[lexer->buf_size - 2] = 0;
//show incomplet recognition prompt
show_prompt(PROMPT_PS2);
//retrieve a new line
lexer->eof = 1;
return 0;
}
lexer->buf = strmerge(2, buf, buf2);
lexer->buf_size = strlen(lexer->buf);
free(buf);
free(buf2);
return 1;
}
size_t *buf_pos = &lexer->buf_pos, token_start, token_pos;
char backed = 0, quoted = 0;
const ts_quote*quote;
// Rationale: Search begin of token
//eat separators (" ",\t, \v)
while (buf[*buf_pos] && is_sep(buf[*buf_pos]))
++*buf_pos;
if (buf[*buf_pos] == '#')
while (buf[*buf_pos] && buf[*buf_pos] != '\n')
++*buf_pos;
//check if first chars is an operator
if (is_operator(buf + *buf_pos, buf_pos, &lexer->token))
return 1;
// Rationale: Search end of token
for (; buf[token_pos]; ++token_pos) {
// backslah newline => eatline
if ((backed || quoted) && buf[token_pos] == '\n' && lexer_eatline(lexer))
return 0; //new line added, you can try again
// backed, go to next char
else if (backed) backed = 0;
// check end of quoting
else if (quoted && is_quote_stop(buf, &token_pos, quote)) quoted = 0;
// quotin not ended !
else if (quoted) continue;
// if backslash go in state backed
else if (!backed && buf[token_pos] == '\\') backed = 1;
// if sep, a token was found !
else if (is_sep(buf[token_pos])) end_found = 1;
// if it's an operator cut
else if (is_operator(buf + token_pos, NULL, NULL)) end_found = 1;
// check to start quoting
else if (!quoted && is_quote_start(buf, &token_pos, "e)) quoted = 1;
lexer->buf_pos = token_pos; //update real lexer position buffer
token_set(&lexer->token, TOK_WORD,
strndup(buf + token_start, token_pos - token_start));
/*
** Test whether buf starts with one of the entries of the operators table.
** The table is scanned in order until the TOK_NONE sentinel and the first
** match wins. On a match, *buf_pos (when buf_pos is not NULL) is increased
** by the operator length and token (when not NULL) is set to the operator.
** Returns 1 on a match, 0 otherwise.
*/
static int is_operator(const char *buf, size_t *buf_pos, s_token *token)
{
  for (int i = 0; operators[i].id != TOK_NONE; ++i) {
    if (strncmp(buf, operators[i].str, operators[i].len) != 0) {
      continue;
    }
    if (buf_pos != NULL) {
      *buf_pos += operators[i].len;
    }
    if (token != NULL) {
      token_set(token, operators[i].id, operators[i].str);
    }
    return 1;
  }
  return 0;
}
static int is_quote_start(const char *buf, size_t *buf_pos, const ts_quote **quote)
{
for (register int i = 0; quotes[i].start; ++i)
if (!strncmp(buf + *buf_pos, quotes[i].start, quotes[i].lenstart)) {
*buf_pos += quotes[i].lenstart - 1;
if (quote)
*quote = quotes + i;
return 1;
}
return 0;
}
/*
** Look for the closing sequence of the given quote at buf + *buf_pos.
** On a match, *buf_pos is moved onto the last char of the closing
** sequence and 1 is returned; otherwise *buf_pos is untouched and 0 is
** returned.
*/
static int is_quote_stop(const char *buf, size_t *buf_pos, const ts_quote *quote)
{
  const char *cursor = buf + *buf_pos;

  if (strncmp(cursor, quote->stop, quote->lenstop) != 0) {
    return 0;
  }
  *buf_pos += quote->lenstop - 1;
  return 1;
}