phabricator/src/aphront/multipartparser/AphrontMultipartParser.php

<?php

/**
 * Incremental (streaming) parser for "multipart/form-data" request bodies.
 *
 * Usage: call @{method:setContentType} with the request "Content-Type"
 * header value, then @{method:beginParse}, then @{method:continueParse} any
 * number of times as bytes arrive, then @{method:endParse} to retrieve the
 * list of parsed parts.
 *
 * The parser is a small state machine. The states are:
 *
 *   - `bodynewline`: we are at the start of a line (or the start of the
 *     request) and are checking whether it begins with "--<boundary>".
 *   - `endboundary`: we just consumed "--<boundary>" and expect either "--"
 *     (end of all parts) or "\r\n" (a new part follows).
 *   - `header`: we are reading the header lines of a part.
 *   - `body`: we are reading the data of a part (or the preamble, when no
 *     part object exists yet).
 *   - `epilogue`: we have seen the closing delimiter; remaining bytes are
 *     discarded.
 */
final class AphrontMultipartParser extends Phobject {

  private $contentType;
  private $boundary;

  // Bytes received via continueParse() which have not been consumed yet.
  private $buffer;
  private $body;
  // Name of the current state machine state; see the class documentation.
  private $state;

  // The part currently being read, or null in the preamble and epilogue.
  private $part;
  // List of all parts created so far, in the order they were encountered.
  private $parts;

  /**
   * Set the raw "Content-Type" header value for the request being parsed.
   *
   * @param string Content type, including the "boundary" parameter.
   * @return this
   */
  public function setContentType($content_type) {
    $this->contentType = $content_type;
    return $this;
  }

  /**
   * Get the content type previously provided to @{method:setContentType}.
   *
   * @return string|null Content type, or null if none has been set.
   */
  public function getContentType() {
    return $this->contentType;
  }

  /**
   * Validate the content type, extract the boundary, and reset the parser
   * so it is ready to receive bytes through @{method:continueParse}.
   *
   * Throws PhutilInvalidStateException if no content type has been set, and
   * Exception if the content type is not "multipart/form-data" or carries no
   * usable "boundary" parameter.
   *
   * @return void
   */
  public function beginParse() {
    $content_type = $this->getContentType();
    if ($content_type === null) {
      throw new PhutilInvalidStateException('setContentType');
    }

    // Media type names are case-insensitive, so match without regard to case.
    if (!preg_match('(^multipart/form-data)i', $content_type)) {
      throw new Exception(
        pht(
          'Expected "multipart/form-data" content type when executing a '.
          'multipart body read.'));
    }

    $type_parts = preg_split('(\s*;\s*)', $content_type);
    $boundary = null;
    foreach ($type_parts as $type_part) {
      $matches = null;
      // Parameter names are case-insensitive; the value is kept verbatim.
      if (!preg_match('(^boundary=(.*))i', $type_part, $matches)) {
        continue;
      }

      $boundary = $matches[1];

      // The boundary may be sent as a quoted string (boundary="..."). The
      // quotes are not part of the delimiter which appears in the body, so
      // strip them.
      $boundary_len = strlen($boundary);
      if ($boundary_len >= 2) {
        $is_quoted =
          ($boundary[0] === '"') &&
          ($boundary[$boundary_len - 1] === '"');
        if ($is_quoted) {
          // Cast because older PHP versions return "false" rather than an
          // empty string when nothing remains between the quotes.
          $boundary = (string)substr($boundary, 1, -1);
        }
      }

      break;
    }

    // An empty boundary would make a bare "--" the delimiter, which can not
    // be parsed meaningfully, so treat it the same as a missing boundary.
    if ($boundary === null || $boundary === '') {
      throw new Exception(
        pht('Received "multipart/form-data" request with no "boundary".'));
    }

    $this->parts = array();
    $this->part = null;

    $this->buffer = '';
    $this->boundary = $boundary;

    // We're looking for a (usually empty) body before the first boundary.
    $this->state = 'bodynewline';
  }

  /**
   * Feed more bytes of the request body to the parser.
   *
   * The bytes are appended to the internal buffer and the state machine is
   * run until it needs more data than is currently buffered. Throws
   * Exception if a boundary is followed by unexpected bytes, or if the
   * parser is in an unknown state (for example, because
   * @{method:beginParse} was not called first).
   *
   * @param string Next chunk of raw body bytes.
   * @return void
   */
  public function continueParse($bytes) {
    $this->buffer .= $bytes;

    $continue = true;
    while ($continue) {
      switch ($this->state) {
        case 'endboundary':
          // We've just parsed a boundary. Next, we expect either "--" (which
          // indicates we've reached the end of the parts) or "\r\n" (which
          // indicates we should read the headers for the next part).

          if (strlen($this->buffer) < 2) {
            // We don't have enough bytes yet, so wait for more.
            $continue = false;
            break;
          }

          if (!strncmp($this->buffer, '--', 2)) {
            // This is "--" after a boundary, so we're done. We'll read the
            // rest of the body (the "epilogue") and discard it.
            $this->buffer = substr($this->buffer, 2);
            $this->state = 'epilogue';

            $this->part = null;
            break;
          }

          if (!strncmp($this->buffer, "\r\n", 2)) {
            // This is "\r\n" after a boundary, so we're going to read the
            // headers for a part.
            $this->buffer = substr($this->buffer, 2);
            $this->state = 'header';

            // Create the object to hold the part we're about to read.
            $part = new AphrontMultipartPart();
            $this->parts[] = $part;
            $this->part = $part;
            break;
          }

          throw new Exception(
            pht('Expected "\r\n" or "--" after multipart data boundary.'));
        case 'header':
          // We've just parsed a boundary, followed by "\r\n". We are going
          // to read the headers for this part. They are in the form of HTTP
          // headers and terminated by "\r\n". The section is terminated by
          // a line with no header on it.

          if (strlen($this->buffer) < 2) {
            // We don't have enough data to find a "\r\n", so wait for more.
            $continue = false;
            break;
          }

          if (!strncmp("\r\n", $this->buffer, 2)) {
            // This line immediately began "\r\n", so we're done with parsing
            // headers. Start parsing the body.
            $this->buffer = substr($this->buffer, 2);
            $this->state = 'body';
            break;
          }

          // This is an actual header, so look for the end of it.
          $header_len = strpos($this->buffer, "\r\n");
          if ($header_len === false) {
            // We don't have a full header yet, so wait for more data.
            $continue = false;
            break;
          }

          $header_buf = substr($this->buffer, 0, $header_len);
          $this->part->appendRawHeader($header_buf);

          $this->buffer = substr($this->buffer, $header_len + 2);
          break;
        case 'body':
          // We've parsed a boundary and headers, and are parsing the data for
          // this part. The data is terminated by "\r\n--", then the boundary.

          // We'll look for "\r\n", then switch to the "bodynewline" state if
          // we find it.

          $marker = "\r";
          $marker_pos = strpos($this->buffer, $marker);

          if ($marker_pos === false) {
            // There's no "\r" anywhere in the buffer, so we can just read it
            // as provided. Then, since we read all the data, we're done until
            // we get more.

            // Note that if we're in the preamble, we won't have a "part"
            // object and will just discard the data.
            if ($this->part) {
              $this->part->appendData($this->buffer);
            }
            $this->buffer = '';
            $continue = false;
            break;
          }

          if ($marker_pos > 0) {
            // If there are bytes before the "\r", they are plain data: emit
            // them so the buffer begins exactly at the "\r".
            if ($this->part) {
              $this->part->appendData(substr($this->buffer, 0, $marker_pos));
            }
            $this->buffer = substr($this->buffer, $marker_pos);
          }

          $expect = "\r\n";
          $expect_len = strlen($expect);
          if (strlen($this->buffer) < $expect_len) {
            // We don't have enough bytes yet to know if this is "\r\n"
            // or not.
            $continue = false;
            break;
          }

          if (strncmp($this->buffer, $expect, $expect_len)) {
            // The "\r" isn't followed by "\n", so it is ordinary data.
            // Consume only the "\r" itself: the byte after it may be another
            // "\r" which begins a real "\r\n" (for example, part data ending
            // in "\r" immediately before the delimiter), and must be examined
            // again on the next pass rather than swallowed as data.
            if ($this->part) {
              $this->part->appendData($marker);
            }
            $this->buffer = substr($this->buffer, strlen($marker));
            break;
          }

          // Eat the "\r\n".
          $this->buffer = substr($this->buffer, $expect_len);
          $this->state = 'bodynewline';
          break;
        case 'bodynewline':
          // We've parsed a newline in a body, or we just started parsing the
          // request. In either case, we're looking for "--", then the boundary.
          // If we find it, this section is done. If we don't, we consume the
          // bytes and move on.

          $expect = '--'.$this->boundary;
          $expect_len = strlen($expect);

          if (strlen($this->buffer) < $expect_len) {
            // We don't have enough bytes yet, so wait for more.
            $continue = false;
            break;
          }

          if (strncmp($this->buffer, $expect, $expect_len)) {
            // This wasn't the boundary, so return to the "body" state and
            // consume it. (But first, we need to append the "\r\n" which we
            // ate earlier.)
            if ($this->part) {
              $this->part->appendData("\r\n");
            }
            $this->state = 'body';
            break;
          }

          // This is the boundary, so toss it and move on.
          $this->buffer = substr($this->buffer, $expect_len);
          $this->state = 'endboundary';
          break;
        case 'epilogue':
          // We just discard any epilogue.
          $this->buffer = '';
          $continue = false;
          break;
        default:
          throw new Exception(
            pht(
              'Unknown parser state "%s".',
              $this->state));
      }
    }
  }

  /**
   * Finish parsing and return the parts which were read.
   *
   * Throws Exception if the closing delimiter ("--<boundary>--") was never
   * seen, i.e. the parser did not reach the "epilogue" state.
   *
   * @return list<AphrontMultipartPart> Parts, in the order they appeared.
   */
  public function endParse() {
    if ($this->state !== 'epilogue') {
      throw new Exception(
        pht(
          'Expected "multipart/form-data" parse to end '.
          'in state "epilogue".'));
    }

    return $this->parts;
  }


}