Logo Search packages:      
Sourcecode: jade version File versions  Download package

parseCommon.cxx

// Copyright (c) 1994 James Clark
// See the file COPYING for copying permission.

#include "splib.h"
#include "Parser.h"
#include "token.h"
#include "MessageArg.h"
#include "ParserMessages.h"
#include "constant.h"
#include "NumericCharRefOrigin.h"
#include "macros.h"

#ifdef SP_NAMESPACE
namespace SP_NAMESPACE {
#endif

// Parse the data of a processing instruction up to its closing
// delimiter (tokenPic) and pass it to the event handler as an
// ImmediatePiEvent.
// Returns 0 if the entity ends inside the instruction, or if the data
// grows to more than about twice PILEN (the closing delimiter is then
// presumed to be missing); returns 1 otherwise.
Boolean Parser::parseProcessingInstruction()
{
  currentInput()->startToken();
  Location piLocation(currentLocation());
  StringC data;
  Token tok;
  while ((tok = getToken(piMode)) != tokenPic) {
    if (tok == tokenEe) {
      message(ParserMessages::processingInstructionEntityEnd);
      return 0;
    }
    // Only data characters and unrecognized tokens contribute to the data.
    if (tok != tokenUnrecognized && tok != tokenChar)
      continue;
    // An unrecognized token is reported, but its character is still kept.
    if (tok == tokenUnrecognized)
      reportNonSgmlCharacter();
    data += *currentInput()->currentTokenStart();
    // Give up early rather than swallowing the rest of the document.
    if (data.size()/2 > syntax().pilen()) {
      message(ParserMessages::processingInstructionLength,
              NumberMessageArg(syntax().pilen()));
      message(ParserMessages::processingInstructionClose);
      return 0;
    }
  }
  // A properly closed instruction that is merely too long is only reported.
  if (data.size() > syntax().pilen())
    message(ParserMessages::processingInstructionLength,
            NumberMessageArg(syntax().pilen()));
  if (options().warnPiMissingName) {
    // nameEnd is the index just past the leading name, or 0 if the data
    // does not begin with a name start character.
    size_t nameEnd = 0;
    if (data.size() && syntax().isNameStartCharacter(data[0])) {
      nameEnd = 1;
      while (nameEnd < data.size() && syntax().isNameCharacter(data[nameEnd]))
        nameEnd++;
    }
    // The name must exist and be followed by either nothing or an s character.
    if (nameEnd == 0
        || (nameEnd < data.size() && !syntax().isS(data[nameEnd])))
      message(ParserMessages::piMissingName);
  }
  noteMarkup();
  eventHandler().pi(new (eventAllocator()) ImmediatePiEvent(data, piLocation));
  return 1;
}

// Parse the content of a literal into text, reading tokens until a
// closing delimiter token (tokenLit or tokenLita) is seen.
//
// litMode is the token mode used while reading at the input level on
// which the literal started; liteMode is used instead while inputLevel()
// is above that level following an entity reference.
// maxLength is checked once the literal is complete: a longer literal
// causes tooLongMessage to be reported, but the call still returns 1
// unless AttributeValue::handleAsUnterminated() says otherwise.
// flags is tested against:
//   literalDelimInfo   - record the delimiter locations in text
//   literalSingleSpace - ignore a space that would start the text or
//                        follow another space, and drop a trailing space
//   literalNonSgml     - keep non-SGML numeric character references
//                        instead of reporting them
//   literalDataTag     - extra checks on numeric character references
//   literalNoProcess   - parse entity references with ignore level 2
//
// Returns 0 if the entity in which the literal started ends first, if
// the closing delimiter is presumed to have been omitted, if a character
// or entity reference cannot be parsed, or if an over-long attribute
// literal is handled as unterminated; returns 1 otherwise.
Boolean Parser::parseLiteral(Mode litMode,
                       Mode liteMode,
                       size_t maxLength,
                       const MessageType1 &tooLongMessage,
                       unsigned flags,
                       Text &text)
{
  unsigned startLevel = inputLevel();
  Mode currentMode = litMode;
  // If the literal gets to be longer than this, then we assume
  // that the closing delimiter has been omitted if we're at the end
  // of a line and at the starting input level.
  // (Twice maxLength, saturated at the largest size_t value.)
  size_t reallyMaxLength = (maxLength > size_t(-1)/2
                      ? size_t(-1)
                      : maxLength * 2);
  text.clear();
  Location startLoc(currentLocation());
  if (flags & literalDelimInfo)
    text.addStartDelim(currentLocation());
  for (;;) {
    Token token = getToken(currentMode);
    switch (token) {
    case tokenEe:
      // Entity end: an error at the level the literal started on,
      // otherwise pop back out of the referenced entity.
      if (inputLevel() == startLevel) {
      message(ParserMessages::literalLevel);
      return 0;
      }
      text.addEntityEnd(currentLocation());
      popInputStack();
      if (inputLevel() == startLevel)
      currentMode = litMode;
      break;
    case tokenUnrecognized:
      // Nothing more to say if the character was reported as non-SGML.
      if (reportNonSgmlCharacter())
      break;
      message(ParserMessages::literalMinimumData,
            StringMessageArg(currentToken()));
      break;
    case tokenRs:
      text.ignoreChar(currentChar(), currentLocation());
      break;
    case tokenRe:
      if (text.size() > reallyMaxLength && inputLevel() == startLevel) {
#if 0
      message(tooLongMessage, NumberMessageArg(maxLength));
#endif
      // guess that the closing delimiter has been omitted
      Messenger::setNextLocation(startLoc);
      message(ParserMessages::literalClosingDelimiter);
      return 0;
      }
      // fall through
    case tokenSepchar:
      // The character is replaced by a space, or ignored when
      // literalSingleSpace applies and a space would be redundant.
      if ((flags & literalSingleSpace)
        && (text.size() == 0 || text.lastChar() == syntax().space()))
      text.ignoreChar(currentChar(), currentLocation());
      else
      text.addChar(syntax().space(),
                 Location(new ReplacementOrigin(currentLocation(),
                                        currentChar()),
                        0));
      break;
    case tokenSpace:
      if ((flags & literalSingleSpace)
        && (text.size() == 0 || text.lastChar() == syntax().space()))
      text.ignoreChar(currentChar(), currentLocation());
      else
      text.addChar(currentChar(), currentLocation());
      break;
    case tokenCroDigit:
    case tokenHcroHexDigit:
      {
      // Numeric character reference (decimal or hexadecimal).
      Char c;
      Location loc;
      if (!parseNumericCharRef(token== tokenHcroHexDigit, c, loc))
        return 0;
      Boolean isSgmlChar;
      // A failed translation has already been reported; skip the character.
      if (!translateNumericCharRef(c, isSgmlChar))
        break;
      if (!isSgmlChar) {
        if (flags & literalNonSgml)
          text.addNonSgmlChar(c, loc);
        else
          message(ParserMessages::numericCharRefLiteralNonSgml,
                  NumberMessageArg(c));
        break;
      }
      if (flags & literalDataTag) {
        if (!syntax().isSgmlChar(c))
          message(ParserMessages::dataTagPatternNonSgml);
        else if (syntax().charSet(Syntax::functionChar)->contains(c))
          message(ParserMessages::dataTagPatternFunction);
      }
      if ((flags & literalSingleSpace)
          && c == syntax().space()
          && (text.size() == 0 || text.lastChar() == syntax().space()))
        text.ignoreChar(c, loc);
      else
        text.addChar(c, loc);
      }
      break;
    case tokenCroNameStart:
      if (!parseNamedCharRef())
      return 0;
      break;
    case tokenEroGrpo:
      message(inInstance() ? ParserMessages::eroGrpoStartTag : ParserMessages::eroGrpoProlog);
      break;
    case tokenLit:
    case tokenLita:
      // Closing delimiter: leave the token loop.
      if (flags & literalDelimInfo)
      text.addEndDelim(currentLocation(), token == tokenLita);
      goto done;
    case tokenPeroNameStart:
      if (options().warnInternalSubsetLiteralParamEntityRef
        && inputLevel() == 1)
      message(ParserMessages::internalSubsetLiteralParamEntityRef);
      // fall through
    case tokenEroNameStart:
      {
      ConstPtr<Entity> entity;
      Ptr<EntityOrigin> origin;
      if (!parseEntityReference(token == tokenPeroNameStart,
                          (flags & literalNoProcess) ? 2 : 0,
                          entity, origin))
        return 0;
      if (!entity.isNull())
        entity->litReference(text, *this, origin,
                         (flags & literalSingleSpace) != 0);
      // If the reference opened a new input level, read it in liteMode
      // until the matching tokenEe brings us back to startLevel.
      if (inputLevel() > startLevel)
        currentMode = liteMode;
      }
      break;
    case tokenPeroGrpo:
      message(ParserMessages::peroGrpoProlog);
      break;
    case tokenCharDelim:
      message(ParserMessages::dataCharDelim,
            StringMessageArg(StringC(currentInput()->currentTokenStart(),
                               currentInput()->currentTokenLength())));
      // fall through
    case tokenChar:
      // Same omitted-delimiter heuristic as for tokenRe, for an RE
      // character that arrives as ordinary data.
      if (text.size() > reallyMaxLength && inputLevel() == startLevel
        && currentChar() == syntax().standardFunction(Syntax::fRE)) {
#if 0
      message(tooLongMessage, NumberMessageArg(maxLength));
#endif
      // guess that the closing delimiter has been omitted
      Messenger::setNextLocation(startLoc);
      message(ParserMessages::literalClosingDelimiter);
      return 0;
      }
      text.addChar(currentChar(), currentLocation());
      break;
    }
  }
 done:
  // With literalSingleSpace a trailing space is not part of the value.
  if ((flags & literalSingleSpace)
      && text.size() > 0
      && text.lastChar() == syntax().space())
    text.ignoreLastChar();
  if (text.size() > maxLength) {
    switch (litMode) {
    case alitMode:
    case alitaMode:
    case talitMode:
    case talitaMode:
      // Attribute value literals get a chance to be treated as
      // unterminated before the plain length message is issued.
      if (AttributeValue::handleAsUnterminated(text, *this))
      return 0;
      // fall through
    default:
      break;
    }
    message(tooLongMessage, NumberMessageArg(maxLength));
  }
  return 1;
}

// Parse a named character reference whose name start has just been
// recognized.  The name is looked up as a function character name; if
// it is known, the character is pushed back onto the current input
// with a NamedCharRef describing the reference, otherwise an error is
// reported and nothing is pushed.
// Always returns 1.
Boolean Parser::parseNamedCharRef()
{
  if (options().warnNamedCharRef)
    message(ParserMessages::namedCharRef);
  InputSource *input = currentInput();
  Index refStart = currentLocation().index();
  input->discardInitial();
  extendNameToken(syntax().namelen(), ParserMessages::nameLength);

  // Look the name up after applying the general substitution table.
  Char functionChar;
  StringC refName;
  getCurrentToken(syntax().generalSubstTable(), refName);
  Boolean known = syntax().lookupFunctionChar(refName, &functionChar) ? 1 : 0;
  if (!known)
    message(ParserMessages::functionName, StringMessageArg(refName));
  else if (wantMarkup())
    getCurrentToken(refName);  // the original name

  // Work out how the reference was terminated; anything other than an
  // explicit reference close may draw a warning.
  NamedCharRef::RefEndType endType;
  const Token endToken = getToken(refMode);
  if (endToken == tokenRefc)
    endType = NamedCharRef::endRefc;
  else {
    endType = (endToken == tokenRe
               ? NamedCharRef::endRE
               : NamedCharRef::endOmitted);
    if (options().warnRefc)
      message(ParserMessages::refc);
  }

  input->startToken();
  if (known)
    input->pushCharRef(functionChar, NamedCharRef(refStart, endType, refName));
  return 1;
}

// Parse a numeric character reference whose first digit has just been
// recognized.  isHex selects hexadecimal rather than decimal digits.
// On success ch receives the character number and loc a location whose
// origin is a NumericCharRefOrigin for the reference; on failure both
// are left untouched.
// Returns 0 (after reporting characterNumber) if the number does not
// fit in a Char or is not declared in the document character set
// declaration; returns 1 otherwise.  The reference end is consumed in
// either case.
Boolean Parser::parseNumericCharRef(Boolean isHex, Char &ch, Location &loc)
{
  InputSource *input = currentInput();
  Location refStart = currentLocation();
  input->discardInitial();

  // Extend the token over the remaining digits.
  if (isHex)
    extendHexNumber();
  else
    extendNumber(syntax().namelen(), ParserMessages::numberLength);

  // Accumulate the value, refusing any step that would exceed charMax.
  const int radix = isHex ? 16 : 10;
  Boolean ok = 1;
  Char number = 0;
  const Char *digitsEnd = input->currentTokenEnd();
  for (const Char *digit = input->currentTokenStart();
       digit < digitsEnd;
       digit++) {
    const int weight = (isHex
                        ? sd().hexDigitWeight(*digit)
                        : sd().digitWeight(*digit));
    if (number > charMax/radix || (number *= radix) > charMax - weight) {
      message(ParserMessages::characterNumber,
              StringMessageArg(currentToken()));
      ok = 0;
      break;
    }
    number += weight;
  }
  if (ok && !sd().docCharsetDecl().charDeclared(number)) {
    ok = 0;
    message(ParserMessages::characterNumber, StringMessageArg(currentToken()));
  }

  // Record the markup seen so far (if wanted) before reading the
  // reference end, since that replaces the current token.
  Owner<Markup> markupPtr;
  const Boolean keepMarkup = wantMarkup();
  if (keepMarkup) {
    markupPtr = new Markup;
    markupPtr->addDelim(isHex ? Syntax::dHCRO : Syntax::dCRO);
    markupPtr->addNumber(input);
  }
  const Token endToken = getToken(refMode);
  if (endToken == tokenRefc) {
    if (keepMarkup)
      markupPtr->addDelim(Syntax::dREFC);
  }
  else {
    if (keepMarkup && endToken == tokenRe)
      markupPtr->addRefEndRe();
    if (options().warnRefc)
      message(ParserMessages::refc);
  }

  if (!ok)
    return 0;
  ch = number;
  // The second argument spans from the start of the reference to the
  // end of the token just read.
  loc = Location(new NumericCharRefOrigin(refStart,
                                          currentLocation().index()
                                          + currentInput()->currentTokenLength()
                                          - refStart.index(),
                                          markupPtr),
                 0);
  return 1;
}

// Translate a character number in the document character set
// into the internal character set.
// On entry ch is the document character number; when a translation is
// made ch is overwritten with the internal character.
// Returns 1 with isSgmlChar set to 1 when ch is usable as a character
// (no translation is needed when the internal character set is the
// document character set).
// Returns 1 with isSgmlChar set to 0 when the character is described
// as UNUSED in the SGML declaration (a non-SGML character).
// Returns 0, after reporting an error, when no translation exists;
// isSgmlChar is not set in that case.

Boolean Parser::translateNumericCharRef(Char &ch, Boolean &isSgmlChar)
{
  // Same character set on both sides: nothing to translate.
  if (sd().internalCharsetIsDocCharset()) {
    if (options().warnNonSgmlCharRef && !syntax().isSgmlChar(ch))
      message(ParserMessages::nonSgmlCharRef);
    isSgmlChar = 1;
    return 1;
  }

  UnivChar universal;
  if (sd().docCharset().descToUniv(ch, universal)) {
    // Map the universal number back into the internal character set.
    WideChar internalChar;
    ISet<WideChar> internalChars;
    switch (sd().internalCharset().univToDesc(universal,
                                              internalChar,
                                              internalChars)) {
    case 1:
      if (internalChar <= charMax) {
        isSgmlChar = 1;
        ch = Char(internalChar);
        return 1;
      }
      // fall through
    case 2:
      message(ParserMessages::numericCharRefBadInternal,
              NumberMessageArg(ch));
      break;
    default:
      message(ParserMessages::numericCharRefNoInternal,
              NumberMessageArg(ch));
      break;
    }
    return 0;
  }

  // No universal number: consult the declaration to find out why.
  const PublicId *basePublicId;
  CharsetDeclRange::Type declType;
  Number baseNumber;
  StringC description;
  if (!sd().docCharsetDecl().getCharInfo(ch, basePublicId, declType,
                                         baseNumber, description)) {
    CANNOT_HAPPEN();
  }
  else if (declType == CharsetDeclRange::unused) {
    if (options().warnNonSgmlCharRef)
      message(ParserMessages::nonSgmlCharRef);
    isSgmlChar = 0;
    return 1;
  }

  if (declType == CharsetDeclRange::string)
    message(ParserMessages::numericCharRefUnknownDesc,
            NumberMessageArg(ch),
            StringMessageArg(description));
  else
    message(ParserMessages::numericCharRefUnknownBase,
            NumberMessageArg(ch),
            NumberMessageArg(baseNumber),
            StringMessageArg(basePublicId->string()));
  return 0;
}

// Parse an entity reference whose opening delimiter has just been
// recognized.  isParameter selects a parameter entity reference rather
// than a general one.
//
// ignoreLevel: 0 means don't ignore;
// 1 means parse name group and ignore if inactive
// 2 means ignore
//
// On return with 1, entity is the entity found (an IgnoredEntity when
// the reference is being ignored, an undefined-entity placeholder for an
// unknown general entity when a DTD applies) or null when no entity
// could be supplied; origin is an EntityOrigin for the reference, or
// null when entity is null.
// Returns 0 only when ignoreLevel is 1 and either the name group cannot
// be parsed or it is not followed by a name start character.

Boolean Parser::parseEntityReference(Boolean isParameter,
                             int ignoreLevel,
                             ConstPtr<Entity> &entity,
                             Ptr<EntityOrigin> &origin)
{
  InputSource *in = currentInput();
  Location startLocation(in->currentLocation());
  Owner<Markup> markupPtr;
  if (wantMarkup()) {
    markupPtr = new Markup;
    markupPtr->addDelim(isParameter ? Syntax::dPERO : Syntax::dERO);
  }
  if (ignoreLevel == 1) {
    // Set the parser's current markup aside so that the name group is
    // parsed into the markup being collected for this reference, then
    // put the previous markup state back.
    Markup savedMarkup;
    Markup *savedCurrentMarkup = currentMarkup();
    if (savedCurrentMarkup)
      savedCurrentMarkup->swap(savedMarkup);
    Location savedMarkupLocation(markupLocation());
    startMarkup(markupPtr != 0, startLocation);
    if (markupPtr) {
      markupPtr->addDelim(Syntax::dGRPO);
      markupPtr->swap(*currentMarkup());
    }
    Boolean ignore;
    // NOTE(review): this early return skips the restore of the saved
    // markup below - confirm callers do not depend on it after a failure.
    if (!parseEntityReferenceNameGroup(ignore))
      return 0;
    if (markupPtr)
      currentMarkup()->swap(*markupPtr);
    startMarkup(savedCurrentMarkup != 0, savedMarkupLocation);
    if (savedCurrentMarkup)
      savedMarkup.swap(*currentMarkup());
    // The name group decides whether the reference is really ignored.
    if (!ignore)
      ignoreLevel = 0;
    // The entity name must follow the name group.
    in->startToken();
    Xchar c = in->tokenChar(messenger());
    if (!syntax().isNameStartCharacter(c)) {
      message(ParserMessages::entityReferenceMissingName);
      return 0;
    }
  }
  in->discardInitial();
  if (isParameter)
    extendNameToken(syntax().penamelen(), ParserMessages::parameterEntityNameLength);
  else
    extendNameToken(syntax().namelen(), ParserMessages::nameLength);
  StringC &name = nameBuffer();
  getCurrentToken(syntax().entitySubstTable(), name);
  if (ignoreLevel)
    entity = new IgnoredEntity(name,
                         isParameter
                         ? Entity::parameterEntity
                         : Entity::generalEntity);
  else {
    entity = lookupEntity(isParameter, name, startLocation, 1);
    if (entity.isNull()) {
      if (haveApplicableDtd()) {
      // An undefined general entity gets a placeholder entity; an
      // undefined parameter entity is only reported and stays null.
      if (!isParameter) {
        entity = createUndefinedEntity(name, startLocation);
        message(ParserMessages::entityUndefined,
              StringMessageArg(name));
      }
      else 
        message(ParserMessages::parameterEntityUndefined,
              StringMessageArg(name));
      }
      else
      message(ParserMessages::entityApplicableDtd);
    }
    else if (entity->defaulted() && options().warnDefaultEntityReference)
      message(ParserMessages::defaultEntityReference, StringMessageArg(name));
  }
  // Consume the reference end, recording it in the markup when markup
  // is being kept; a missing reference close may draw a warning.
  if (markupPtr) {
    markupPtr->addName(in);
    switch (getToken(refMode)) {
    case tokenRefc:
      markupPtr->addDelim(Syntax::dREFC);
      break;
    case tokenRe:
      markupPtr->addRefEndRe();
      if (options().warnRefc)
      message(ParserMessages::refc);
      break;
    default:
      if (options().warnRefc)
      message(ParserMessages::refc);
      break;
    }
  }
  else if (options().warnRefc) {
    if (getToken(refMode) != tokenRefc)
      message(ParserMessages::refc);
  }
  else
    (void)getToken(refMode);
  // The length argument spans from the start of the reference to the
  // end of the token just read.
  if (!entity.isNull())
    origin = EntityOrigin::make(internalAllocator(),
                        entity,
                        startLocation,
                        currentLocation().index()
                        + currentInput()->currentTokenLength()
                        - startLocation.index(),
                        markupPtr);
  else
    origin = (EntityOrigin *)0;
  return 1;
}

// Parse a comment in the given token mode, up to the closing comment
// delimiter (tokenCom).  When markup is being collected, the comment
// start and each ordinary comment character are added to it.
// Returns 0 if the entity ends before the comment is closed, else 1.
Boolean Parser::parseComment(Mode mode)
{
  Location commentStart(currentLocation());
  Markup *markupSink = currentMarkup();
  if (markupSink)
    markupSink->addCommentStart();
  for (;;) {
    const Token tok = getToken(mode);
    if (tok == tokenCom)
      break;
    if (tok == tokenEe) {
      message(ParserMessages::commentEntityEnd, commentStart);
      return 0;
    }
    if (tok == tokenUnrecognized) {
      // Either the character is reported as non-SGML, or it is
      // reported as significant here; it is not added to the markup.
      if (!reportNonSgmlCharacter())
        message(ParserMessages::sdCommentSignificant,
                StringMessageArg(currentToken()));
      continue;
    }
    if (markupSink)
      markupSink->addCommentChar(currentChar());
  }
  return 1;
}

// Lengthen the current token to take in every following name
// character.  The first character that is not a name character is
// read but not counted; the token is then ended at the counted length.
// tooLongMessage is reported (with maxLength as its argument) when the
// resulting token is longer than maxLength; the token is not truncated.
void Parser::extendNameToken(size_t maxLength,
                             const MessageType1 &tooLongMessage)
{
  InputSource *input = currentInput();
  size_t tokenLen = input->currentTokenLength();
  for (;;) {
    if (!syntax().isNameCharacter(input->tokenChar(messenger())))
      break;
    ++tokenLen;
  }
  if (tokenLen > maxLength)
    message(tooLongMessage, NumberMessageArg(maxLength));
  input->endToken(tokenLen);
}

// Lengthen the current token to take in every following digit.  The
// first non-digit is read but not counted; the token is then ended at
// the counted length.
// tooLongMessage is reported (with maxLength as its argument) when the
// resulting token is longer than maxLength; the token is not truncated.
void Parser::extendNumber(size_t maxLength, const MessageType1 &tooLongMessage)
{
  InputSource *input = currentInput();
  size_t tokenLen = input->currentTokenLength();
  for (;;) {
    if (!syntax().isDigit(input->tokenChar(messenger())))
      break;
    ++tokenLen;
  }
  if (tokenLen > maxLength)
    message(tooLongMessage, NumberMessageArg(maxLength));
  input->endToken(tokenLen);
}

void Parser::extendHexNumber()
{
  InputSource *in = currentInput();
  size_t length = in->currentTokenLength();
  while (syntax().isHexDigit(in->tokenChar(messenger())))
    length++;
  if (length > syntax().namelen())
    message(ParserMessages::hexNumberLength, NumberMessageArg(syntax().namelen()));
  in->endToken(length);
}

// Report the character at the current position if it is a non-SGML
// character.
// Returns 1 if a nonSgmlCharacter message was issued, 0 if the
// character is an SGML character (in which case nothing is reported).
Boolean Parser::reportNonSgmlCharacter()
{
  Char suspect;
  // In scanSuppress mode the non-SGML character will have been read;
  // otherwise the current token is empty and it must be read now.
  if (currentInput()->currentTokenLength())
    suspect = currentChar();
  else
    suspect = getChar();
  if (syntax().isSgmlChar(suspect))
    return 0;
  message(ParserMessages::nonSgmlCharacter, NumberMessageArg(suspect));
  return 1;
}

void Parser::extendS()
{
  InputSource *in = currentInput();
  size_t length = in->currentTokenLength();
  while (syntax().isS(in->tokenChar(messenger())))
    length++;
  in->endToken(length);
}

#ifdef SP_NAMESPACE
}
#endif

Generated by  Doxygen 1.6.0   Back to index