kcgi | Using Pages

Using Pages

Ross Richardson

20 September, 2017

Thanks to Ross Richardson's fine work in contributing this tutorial!

In order to facilitate convenient handling of common cases, kcgi provides functionality for dealing with the CGI meta variable PATH_INFO. For example, if /cgi-bin/foo is the CGI script, invoking /cgi-bin/foo/bar/baz will pass /bar/baz as additional information. Many CGI scripts use this functionality for URL normalisation, or for pushing query-string variables into the path.

This tutorial describes an example CGI which implements a news site devoted to some particular topic. The default document shows an index page, and there are sections for particular relevant areas. In each of these, the trailing slash may be included or omitted. I assume that your script is available at /cgi-bin/news.

/cgi-bin/news, /cgi-bin/news/index: main index
/cgi-bin/news/about/: about the site
/cgi-bin/news/archive/: archive of old articles
/cgi-bin/news/archive/yyyy: archive/index of articles for year yyyy
/cgi-bin/news/archive/yyyy/mm: archive/index of articles for month mm of year yyyy
/cgi-bin/news/archive/yyyy/mm/dd: archive/index of articles for date yyyy-mm-dd
/cgi-bin/news/random: a random article
/cgi-bin/news/tag/subj: articles tagged with "subj"

Basic Handling

Assuming a call to khttp_parse(3) returns KCGI_OK, the relevant fields of the struct kreq are:

fullpath

the value of CGI meta variable PATH_INFO (which may be the empty string)

pagename

the substring of PATH_INFO from after the initial '/' to (but excluding) the next '/', or to the end-of-string (or the empty string if no such substring exists)

page

if pagename is the empty string, the defpage parameter passed to khttp_parse(3) (that is, the index corresponding to the default page)
if pagename matches one of the strings in the pages parameter passed to khttp_parse(3), the index of that string
if pagename does not match any of the strings in pages, the pagesz parameter passed to khttp_parse(3)

path

the middle part of PATH_INFO after stripping pagename/ at the beginning and .suffix at the end.

In addition, the field pname contains the value of the CGI meta variable SCRIPT_NAME.

Source Code

Here we look only at the code snippets not covered by the earlier tutorials. Firstly, we define some values corresponding with the subsections of the site.

/*
 * One identifier per subsection ("page") of the site.  The values double
 * as indices into the pages[] string table, so the order here must match
 * the order of that table.
 */
enum pg {
  PG_INDEX = 0, /* main index (also the default page) */
  PG_ABOUT,     /* about the site */
  PG_ARCHIVE,   /* archive of old articles, optionally by date */
  PG_RANDOM,    /* a random article */
  PG_TAG,       /* articles carrying a given tag */
  PG__MAX       /* number of pages; also means "no page matched" */
};

Next, we define the path strings corresponding with the enumeration values.

static const char *pages[PG__MAX] = {
  "index",
  "about",
  "archive",
  "random",
  "tag"
};

We then define a constant bitmap corresponding with those enum pg values for which no extra path information should be present in the HTTP request. This will be used for sanity-checking the request.

/*
 * Bitmap of pages that must not be followed by any extra path
 * information: bit (1 << pg) is set for each such enum pg value.
 * Used to sanity-check the request before dispatching it.
 */
const size_t pg_no_extra_permitted =
  ((size_t)1 << PG_INDEX) |
  ((size_t)1 << PG_ABOUT) |
  ((size_t)1 << PG_RANDOM);

Next, we define a type for dates, a constant for the earliest valid year, and functions for parsing a string specifying a date. We use year zero to indicate an invalid specification, and month/day zero to indicate that a month/day value was not specified.

Editor's note: remember that strptime(3) and friends may not be available within a file-system sandbox due to time-zone access, so we need to find another way.

/*
 * A (possibly partial) archive date as filled in by str_to_adate().
 * A year of zero marks the whole specification as invalid; a month or
 * day of zero means that component was simply not given.
 */
struct adate {
  unsigned int year; /* 0 if invalid */
  unsigned int month; /* 1-12, or 0 if not specified */
  unsigned int day; /* 1-based day of month, or 0 if not specified */
};

/* Earliest year accepted in a date; lower bound used by str_to_adate(). */
const unsigned int  archive_first_yr = 1995;

/*
 * Return the current calendar year according to the system clock,
 * broken down as UTC.
 * Terminates the process with EXIT_FAILURE if the time cannot be
 * obtained or converted.
 */
static unsigned int
current_year(void)
{
  time_t stamp;
  struct tm *utc;

  stamp = time(NULL);
  if (stamp == (time_t)-1)
    exit(EXIT_FAILURE);

  utc = gmtime(&stamp);
  if (utc == NULL)
    exit(EXIT_FAILURE);

  /* tm_year counts years since 1900. */
  return utc->tm_year + 1900;
} /* current_year */

/*
 * Return the number of days in month m (1-12) of year y, applying the
 * Gregorian leap-year rule for February.
 * Terminates the process with EXIT_FAILURE if m is not a valid month.
 */
static unsigned int
month_length(unsigned int y, unsigned int m)
{
  /* Lengths for a non-leap year, indexed by month minus one. */
  static const unsigned int days[12] = {
    31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
  };
  int leap;

  if (m < 1 || m > 12)
    exit(EXIT_FAILURE);

  if (m != 2)
    return days[m - 1];

  /* Every 4th year, except centuries not divisible by 400. */
  leap = (y % 400 == 0) || (y % 4 == 0 && y % 100 != 0);
  return leap ? 29 : days[1];
} /* month_length */

/*
 * Parse a date of the form "yyyy", "yyyy<sep>mm" or "yyyy<sep>mm<sep>dd"
 * from s into *d.  A single trailing separator after any component is
 * tolerated; anything following the day is rejected.
 *
 * The year must lie between archive_first_yr and the current year, the
 * month between 1 and 12, and the day within the length of that month.
 *
 * On success d->year is non-zero, and d->month / d->day are zero when
 * the corresponding component was not given.  On any error (including a
 * NULL or empty s, a NUL separator, or characters other than digits and
 * sep) d->year is zero and the other fields must not be relied upon.
 */
static void
str_to_adate(const char *s, char sep, struct adate *d)
{
  long long val;
  char *copy, *field, *next;
  size_t i;

  /* Set error/default state until proven otherwise. */
  d->year = 0;
  d->month = 0;
  d->day = 0;

  /*
   * A NUL separator would make the scan below accept the terminator
   * itself and run off the end of the string.
   */
  if (s == NULL || sep == '\0')
    return;

  /* Accept only a non-empty string of digits and separators. */
  for (i = 0; isdigit((unsigned char)s[i]) || s[i] == sep; i++)
    continue;
  if (i == 0 || s[i] != '\0')
    return;

  /* Make a copy with which it is safe to tamper. */
  copy = kstrdup(s);

  /* Year: strtonum() yields 0 on error, which is below every minimum. */
  field = copy;
  if ((next = strchr(field, sep)) != NULL)
    *next = '\0';
  val = strtonum(field, archive_first_yr, current_year(), NULL);
  if (val == 0)
    goto out;
  d->year = val;
  if (next == NULL || next[1] == '\0')
    goto out;

  /* Month. */
  field = &next[1];
  if ((next = strchr(field, sep)) != NULL)
    *next = '\0';
  val = strtonum(field, 1, 12, NULL);
  if (val == 0)
    goto invalid;
  d->month = val;
  if (next == NULL || next[1] == '\0')
    goto out;

  /* Day: nothing but an optional trailing separator may follow. */
  field = &next[1];
  if ((next = strchr(field, sep)) != NULL)
    *next = '\0';
  if (next != NULL && next[1] != '\0')
    goto invalid;
  val = strtonum(field, 1, month_length(d->year, d->month), NULL);
  if (val == 0)
    goto invalid;
  d->day = val;
  goto out;

invalid:
  d->year = 0;
  d->month = 0;
out:
  free(copy);
} /* str_to_adate */

Now, we consider the basic handling of the request.

/*
 * CGI entry point.  Parses the request, rejects anything that is not a
 * GET/HEAD for an HTML page we know about, and dispatches to the handler
 * for the requested page.
 *
 * Returns EXIT_FAILURE if the request could not be parsed at all, and
 * EXIT_SUCCESS once a response (possibly an error page) has been
 * handed to one of the handlers.
 */
int
main(void)
{
  struct kreq r;
  struct adate ad;

  /* No query-string keys are validated; PG_INDEX is the default page. */
  if (khttp_parse(&r, NULL, 0,
      pages, PG__MAX, PG_INDEX) != KCGI_OK)
    return EXIT_FAILURE /* abort */;

  if (r.mime != KMIME_TEXT_HTML) {
    handle_err(&r, KHTTP_404);
  } else if (r.method != KMETHOD_GET &&
             r.method != KMETHOD_HEAD) {
    handle_err(&r, KHTTP_405);
  } else if (r.page == PG__MAX ||
            (r.path != NULL && r.path[0] != '\0' &&
             ((1 << r.page) & pg_no_extra_permitted))) {
    /* Unknown page, or extra path on a page that takes none. */
    handle_err(&r, KHTTP_404);
  } else {
    switch (r.page) {
      case PG_INDEX :
        handle_index(&r);
        break;
      case PG_ABOUT :
        handle_about(&r);
        break;
      case PG_ARCHIVE :
        if (r.path != NULL && r.path[0] != '\0') {
          /* A date was given in the path; it must be valid. */
          str_to_adate(r.path, '/', &ad);
          if (ad.year != 0) {
            handle_archive(&r, &ad);
          } else {
            handle_err(&r, KHTTP_404);
          }
        } else {
          /* Not specified at all. */
          handle_archive(&r, NULL);
        }
        break;
      case PG_RANDOM :
        handle_random(&r);
        break;
      case PG_TAG :
        handle_tag(&r, r.path);
        break;
      default :
        /* shouldn't happen */
        handle_err(&r, KHTTP_500);
        break;
    }
  }

  khttp_free(&r);
  return EXIT_SUCCESS;
}

Suppose we now decide that we wish to fall back to looking for a date specification (with '-' separators rather than '/') in the query string if none is specified in the path. This is as simple as adding the required definition…

/*
 * Query-string keys understood by the script.  The values index the
 * keys[] validator table and the request's fieldmap / fieldnmap arrays.
 */
enum key {
  KEY_ADATE = 0, /* archive date, '-' separated */
  KEY__MAX       /* number of keys */
};

…and adding a validator function…

static int
valid_adate(struct kpair* kp)
{
  struct adate ad;
  int ok;

  /* Invalid until proven otherwise. */
  ok = 0;

  if (kvalid_stringne(kp)) {
    str_to_adate(kp->val, '-', &ad);
    if (ad.year != 0) {
      /* We have a valid specification. */
      kp->type = KPAIR__MAX  /* Not a simple type. */;
      kp->valsz = sizeof(ad);
      kp->val   = kmalloc(kp->valsz);
      ((struct adate*)kp->val)->year  = ad.year;
      ((struct adate*)kp->val)->month = ad.month;
      ((struct adate*)kp->val)->day   = ad.day;
      ok = 1;
    }
  }
  return ok;
} /* valid_adate */

static const struct kvalid keys[KEY__MAX] = {
  { valid_adate, "adate" }  /* KEY_ADATE */
};

(Note that the same date parsing function, str_to_adate(), is used but in this case it is wrapped in a validator function and thus executes in the sandboxed environment.)

…and, in main(), modifying the call to khttp_parse(3)…

/*
 * Parse the request, now validating the query-string keys as well.
 * Per khttp_parse(3), khttp_free(3) must only be called when KCGI_OK
 * was returned; on failure the library has already released whatever
 * it allocated, so we simply bail out.
 */
if (khttp_parse(&r, keys, KEY__MAX,
      pages, PG__MAX, PG_INDEX) != KCGI_OK)
  return EXIT_FAILURE  /* abort */;

…and handling of the PG_ARCHIVE case…

case PG_ARCHIVE :
  if (r.path != NULL && r.path[0] != '\0') {
    str_to_adate(r.path, '/', &ad);
    if (ad.year != 0)
      handle_archive(&r, &ad);
    else
      handle_err(&r, KHTTP_404);
  } else if (r.fieldmap[KEY_ADATE] != NULL) {
    /* Fallback to field. */
    handle_archive(&r, (struct adate*)r.fieldmap[KEY_ADATE]->val);
  } else if (r.fieldnmap[KEY_ADATE] != NULL) {
    /* Field is invalid. */
    handle_err(&r, KHTTP_404);
  } else {
    /* Not specified at all. */
    handle_archive(&r, NULL);
  }
  break;

Whilst some specifications are naturally suited to the use of path information (for example, dates, file system hierarchies, and timezones), others are a less natural fit. Suppose, in our example, that we want to be able to specify a date and a tag at the same time. This could be achieved by extending the behaviour of the archive or tag "page", but does not fit comfortably with either. In general, use of query string keys is preferred over pages because the former:

involve parsing/validation in a sandboxed environment
allow for greater flexibility

Editor's note: Ross makes a good case for putting some sort of handling facility for URLs into the protected child process. For example, we could pass a string into khttp_parsex(3) that would define a template for splitting the path into arguments. For example, /@@0@@/@@1@@/@@2@@ might consider a pathname matching /foo/bar/baz with components being validated as query arguments.