Newer
Older
/*
** lexer.c for 42sh
**
** Made by Seblu
** Login <seblu@epita.fr>
**
** Started on Sun Jul 30 04:36:53 2006 Seblu
** Last update Fri Aug 25 15:16:23 2006 Seblu
*/
#include "parser.h"
#include "../shell/shell.h"
#include "../readline/readline.h"
/*
** Token recognition rationale:
** - Separators are chars which are used to cut tokens but which are deleted.
** - Operators are chars which are used to cut tokens and which are returned as tokens.
** - Quotes are chars which must be returned entirely.
*/
/*
** ============
** DECLARATIONS
** ============
*/
// Order is very important for correct recognition !
static const ts_token operators[] =
{TOK_NEWLINE, "\n", 1},
{TOK_AND, "&&", 2},
{TOK_SEPAND, "&", 1},
{TOK_OR, "||", 2},
{TOK_PIPE, "|", 1},
{TOK_DSEMI, ";;", 2},
{TOK_SEP, ";", 1},
{TOK_DLESSDASH, "<<-", 3},
{TOK_DLESS, "<<", 2},
{TOK_LESSGREAT, "<>", 2},
{TOK_LESSAND, "<&", 2},
{TOK_LESS, "<", 1},
{TOK_DGREAT, ">>", 2},
{TOK_GREATAND, ">&", 2},
{TOK_CLOBBER, ">|", 2},
{TOK_GREAT, ">", 1},
{TOK_NONE, NULL, 0}
/*
** Description of a quoting construct: the character sequence which opens
** it, the sequence which closes it, and the length of each sequence.
*/
typedef struct s_quote
{
  const char *start;
  const size_t lenstart;
  const char *stop;
  const size_t lenstop;
} ts_quote;

/*
** Quoting constructs known by the lexer, terminated by an entry whose
** start is NULL.
**
** Order is important: the table is scanned from the first entry and the
** first match wins, so "$((" must be listed before its prefix "$(".
** Each length must be the full length of its sequence, otherwise only a
** prefix of the sequence is compared.
*/
static const ts_quote quotes[] =
  {
    {"\"", 1, "\"", 1},
    {"'", 1, "'", 1},
    {"`", 1, "`", 1},
    {"${", 2, "}", 1},
    {"$((", 3, "))", 2},
    {"$(", 2, ")", 1},
    {NULL, 0, NULL, 0},
  };
/*!
** Check if buf + *buf_pos points to the start of a quote sequence.
** @warning Recognition starts at buf + *buf_pos!
**
** @param buf a string buffer
** @param buf_pos position in the buffer, which is advanced, if a quote is found,
** to point to the last char of the quote start sequence!
** @param quote quote type found
**
** @return true (!0) if a quote is found, else false (0)
*/
static int is_quote_start(const char *buf, size_t *buf_pos, const ts_quote **quote);
/*!
** Check if buf + *buf_pos points to the end of a quote sequence.
** @warning Recognition starts at buf + *buf_pos!
**
** @param buf a string buffer
** @param buf_pos position in the buffer, which is advanced, if the end is found,
** to point to the last char of the quote stop sequence!
** @param quote quote type to find
**
** @return true (!0) if the end of the quote is found, else false (0)
*/
static int is_quote_stop(const char *buf, size_t *buf_pos, const ts_quote *quote);
/*!
** Tell whether a character is a token separator.
**
** @param c character to test; it is evaluated up to three times, so it
** must not have side effects
**
** @return non-zero if c is a space, a horizontal tab or a vertical tab
*/
#define is_sep(c)          \
  ((c) == ' '              \
   || (c) == '\t'          \
   || (c) == '\v')
/*!
** Check if the buffer points to an operator. If it does and buf_pos is
** not NULL, *buf_pos is incremented by the operator length so that it
** points just after the operator.
** @warning Recognition starts at buf!
**
** @param buf a string buffer where recognition starts
** @param buf_pos buffer position to increment if an operator is found (may be NULL)
** @param token set to the recognized operator token if found (may be NULL)
**
** @return true (!0) if found, else false (0)
*/
static int is_operator(const char *buf, size_t *buf_pos, ts_token *token);
/*!
** Read the lexer's stream and store the next token in the lexer.
*/
static void lexer_eattoken(ts_lexer *lex);
/*!
** This function is only called when the end of a line occurs inside
** a quote or after a backslash. The new line is appended to the
** lexer buffer.
**
** @param lexer lexer struct
**
** @return 1 if a line could be read, 0 on eof
*/
static int lexer_eatline(ts_lexer *lexer);
/*!
** Try to cut one token out of the lexer buffer, starting at the current
** buffer position, and store it in the lexer.
**
** @param lexer lexer struct
**
** @return 0 when a new line was appended to the buffer and the cut must
** be retried, 1 when an operator token was stored
** (NOTE(review): presumably also non-zero once a word is cut - confirm)
*/
static int lexer_cut(ts_lexer *lexer);
/*!
** Correctly set a token. First, the string currently held by the token
** is freed if the token is a word (TOK_WORD).
**
** @param token token to set
** @param id new token id
** @param s new token string (may be NULL, the token length is then 0)
*/
static void token_set(ts_token *token, te_tokenid id, const char *s);
/*
** ===========
** DEFINITIONS
** ===========
*/
ts_lexer *lexer_init(FILE *fs)
ts_lexer *new;
secmalloc(new, sizeof (ts_lexer));
fflush(fs);
new->fs = fs;
new->buf = NULL;
new->buf_size = new->buf_pos = 0;
new->token.id = TOK_NONE;
new->token.str = NULL;
new->token.len = 0;
new->eof = 0;
ts_token buf = { TOK_EOF, "EOF", 3 };
buf = lexer->token;
lexer->token.id = TOK_NONE;
lexer->token.str = NULL;
return buf;
/*!
** Here-document handling. Nothing is implemented yet: the function
** ignores its argument and returns.
**
** @param lexer lexer struct (currently unused)
*/
void lexer_heredocument(ts_lexer *lexer)
{
  // Mark the parameter as intentionally unused; a self-assignment
  // triggers compiler warnings (e.g. -Wself-assign).
  (void) lexer;
}
/*!
** Set a token, releasing the string it previously held when needed.
**
** @param token token to set
** @param id new token id
** @param s new token string; may be NULL, the token length is then 0.
** The pointer is stored as is, it is not copied.
*/
static void token_set(ts_token *token, te_tokenid id, const char *s)
{
  // Only words hold a heap-allocated string (built with strndup by the
  // lexer); the other tokens point to constant strings.
  if (token->id == TOK_WORD)
    free((char *) token->str);
  token->id = id;
  token->str = s;
  token->len = s ? strlen(s) : 0;
}
/*
** Store the next token in lexer->token.
**
** Visible steps: report TOK_EOF if the eof flag is already set, release
** the buffer once it has been entirely consumed, read a new line when
** the buffer is empty, flag eof and report TOK_EOF when no line could be
** read, then call lexer_cut until it succeeds.
**
** NOTE(review): this function is incomplete in this view. The opening
** brace, several closing braces and probably some statements after the
** readline call are missing. Restore them from the original file before
** changing anything here.
*/
static void lexer_eattoken(ts_lexer *lexer)
//if eof was already reached, set token EOF
if (lexer->eof) {
token_set(&lexer->token, TOK_EOF, "EOF");
return;
}
//if the last char was read, free the buffer
if (lexer->buf_size > 0 && lexer->buf_pos == lexer->buf_size) {
free(lexer->buf);
lexer->buf = NULL;
lexer->buf_size = 0;
}
//read a line if buf is empty
if (!lexer->buf_size && ((lexer->buf = readline(NULL)) != NULL)) {
//if eof is read, bye bye
if (lexer->buf == NULL) {
lexer->eof = 1;
token_set(&lexer->token, TOK_EOF, "EOF");
return;
//cut a slice of stream
//lexer_cut returns 0 after appending a new line to the buffer
while (!lexer_cut(lexer))
;; //retry again
/*!
** Append one more input line to the lexer buffer. Before reading, the
** trailing newline of the current buffer is erased, as well as a
** backslash located just before it. The secondary prompt (PROMPT_PS2)
** is shown before the read.
**
** @param lexer lexer struct; its buffer must not be NULL
**
** @return 1 if a line was read and merged into the buffer, 0 on eof
** (the eof flag of the lexer is then set)
*/
static int lexer_eatline(ts_lexer *lexer)
{
  char *old_line = lexer->buf;
  char *new_line;

  assert(old_line);
  // erase the final newline
  if (lexer->buf_size > 0 && old_line[lexer->buf_size - 1] == '\n')
    old_line[lexer->buf_size - 1] = '\0';
  // erase the backslash which was before the newline
  if (lexer->buf_size > 1 && old_line[lexer->buf_size - 2] == '\\')
    old_line[lexer->buf_size - 2] = '\0';
  // show the incomplete recognition prompt
  show_prompt(PROMPT_PS2);
  // retrieve a new line
  new_line = readline(NULL);
  if (new_line == NULL) {
    lexer->eof = 1;
    return 0;
  }
  // the merged string replaces the buffer, both parts are released
  lexer->buf = strmerge(2, old_line, new_line);
  lexer->buf_size = strlen(lexer->buf);
  free(old_line);
  free(new_line);
  return 1;
}
/*
** Cut one token out of the lexer buffer.
**
** Visible steps: skip separators, skip a comment starting with a hash
** sign up to the end of line, return 1 if an operator starts at the
** current position (is_operator stores it in the lexer token), otherwise
** scan for the end of a word while tracking backslash and quote state,
** then store the word as a TOK_WORD token.
** Returns 0 when lexer_eatline appended a new line and the cut must be
** retried.
**
** NOTE(review): this function is incomplete in this view. The opening
** brace, the declarations of buf and end_found, the initialisation of
** token_start and token_pos, the exit of the loop on end_found, the final
** return and the closing braces are not visible. The last argument of the
** is_quote_start call also looks garbled (presumably the address of
** quote). Restore all of this from the original file.
*/
static int lexer_cut(ts_lexer *lexer)
size_t *buf_pos = &lexer->buf_pos, token_start, token_pos;
char backed = 0, quoted = 0;
const ts_quote*quote;
// Rationale: Search begin of token
//eat separators (" ",\t, \v)
while (buf[*buf_pos] && is_sep(buf[*buf_pos]))
++*buf_pos;
//eat a comment up to the end of the line
if (buf[*buf_pos] == '#')
while (buf[*buf_pos] && buf[*buf_pos] != '\n')
++*buf_pos;
//check if first chars is an operator
if (is_operator(buf + *buf_pos, buf_pos, &lexer->token))
return 1;
// Rationale: Search end of token
for (; buf[token_pos]; ++token_pos) {
// newline after a backslash or inside a quote => eatline
if ((backed || quoted) && buf[token_pos] == '\n' && lexer_eatline(lexer))
return 0; //new line added, you can try again
// backed, go to next char
else if (backed) backed = 0;
// check end of quoting
else if (quoted && is_quote_stop(buf, &token_pos, quote)) quoted = 0;
// quoting not ended !
else if (quoted) continue;
// if backslash go in state backed
else if (!backed && buf[token_pos] == '\\') backed = 1;
// if sep, a token was found !
else if (is_sep(buf[token_pos])) end_found = 1;
// if it's an operator cut
else if (is_operator(buf + token_pos, NULL, NULL)) end_found = 1;
// check to start quoting
else if (!quoted && is_quote_start(buf, &token_pos, "e)) quoted = 1;
lexer->buf_pos = token_pos; //update real lexer position buffer
token_set(&lexer->token, TOK_WORD,
strndup(buf + token_start, token_pos - token_start));
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/*!
** Look for an operator at the very beginning of a buffer. The operators
** table is scanned in order and the first entry whose text matches wins.
**
** @param buf string where the recognition starts
** @param buf_pos if not NULL, incremented by the length of the operator found
** @param token if not NULL, set (with token_set) to the operator found
**
** @return 1 if an operator is found, else 0
*/
static int is_operator(const char *buf, size_t *buf_pos, ts_token *token)
{
  const ts_token *op;

  for (op = operators; op->id != TOK_NONE; ++op) {
    if (strncmp(buf, op->str, op->len) != 0)
      continue;
    if (buf_pos != NULL)
      *buf_pos += op->len;
    if (token != NULL)
      token_set(token, op->id, op->str);
    return 1;
  }
  return 0;
}
static int is_quote_start(const char *buf, size_t *buf_pos, const ts_quote **quote)
{
for (register int i = 0; quotes[i].start; ++i)
if (!strncmp(buf + *buf_pos, quotes[i].start, quotes[i].lenstart)) {
*buf_pos += quotes[i].lenstart - 1;
if (quote)
*quote = quotes + i;
return 1;
}
return 0;
}
/*!
** Look for the stop sequence of a given quote at buf + *buf_pos.
**
** @param buf a string buffer
** @param buf_pos position in the buffer; on success it is moved to the
** last char of the stop sequence
** @param quote quote whose stop sequence is searched, must not be NULL
**
** @return 1 if the stop sequence is found, else 0
*/
static int is_quote_stop(const char *buf, size_t *buf_pos, const ts_quote *quote)
{
  if (strncmp(buf + *buf_pos, quote->stop, quote->lenstop) != 0)
    return 0;
  // stay on the last char of the sequence
  *buf_pos += quote->lenstop - 1;
  return 1;
}