diff --git a/include/tidyenum.h b/include/tidyenum.h index e38a63e71..9b752cc00 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -169,6 +169,7 @@ extern "C" { FN(BACKSLASH_IN_URI) \ FN(BAD_ATTRIBUTE_VALUE_REPLACED) \ FN(BAD_ATTRIBUTE_VALUE) \ + FN(ATTRIBUTE_VALUE_REPLACED) \ FN(ESCAPED_ILLEGAL_URI) \ FN(FIXED_BACKSLASH) \ FN(ID_NAME_MISMATCH) \ @@ -258,14 +259,15 @@ extern "C" { FN(REMOVED_HTML5) \ FN(XML_DECLARATION_DETECTED) \ /* Report, mixed use */ \ + FN(ADDED_MISSING_CHARSET) \ FN(COERCE_TO_ENDTAG) \ FN(ELEMENT_NOT_EMPTY) \ + FN(FOUND_STYLE_IN_BODY) \ + FN(MOVED_STYLE_TO_HEAD) \ FN(UNEXPECTED_END_OF_FILE) \ FN(UNEXPECTED_ENDTAG) \ - FN(UNEXPECTED_ENDTAG_ERR) \ - FN(MOVED_STYLE_TO_HEAD) \ - FN(FOUND_STYLE_IN_BODY) - + FN(UNEXPECTED_ENDTAG_ERR) + /** These are report messages added by Tidy's accessibility module. ** Note that commented out items don't have checks for them at this time, @@ -589,6 +591,7 @@ typedef enum TidyMergeDivs, /**< Merge multiple DIVs */ TidyMergeEmphasis, /**< Merge nested B and I elements */ TidyMergeSpans, /**< Merge multiple SPANs */ + TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */ #if SUPPORT_ASIAN_ENCODINGS TidyNCR, /**< Allow numeric character references */ #else @@ -620,11 +623,12 @@ typedef enum TidyShowErrors, /**< Number of errors to put out */ TidyShowInfo, /**< If true, info-level messages are shown */ TidyShowMarkup, /**< If false, normal output is suppressed */ + TidyShowMetaChange, /**< show when meta http-equiv content charset was changed - compatibility */ TidyShowWarnings, /**< However errors are always shown */ TidySkipNested, /**< Skip nested tags in script and style CDATA */ TidySortAttributes, /**< Sort attributes */ TidyStrictTagsAttr, /**< Ensure tags and attributes match output HTML version */ TidyStyleTags, /**< Move style to head */ TidyTabSize, /**< Expand tabs to n spaces */ TidyUpperCaseAttrs, /**< Output 
attributes in upper not lower case */
     TidyUpperCaseTags, /**< Output tags in upper not lower case */
diff --git a/src/attrs.h b/src/attrs.h
index e5b0fa975..0192efcbd 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
 #define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
 #define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
 #define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
+#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
 #define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
 #define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
 #define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
@@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
 #define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
 #define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
 #define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
+#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
 #define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
 #define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
 #define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
diff --git a/src/clean.c b/src/clean.c
index 7b56f34ac..707e4d90a 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2208,6 +2208,9 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
 }
 #endif
 
+/* Issue #456 - This is discarded
+   See replacement TidyMetaCharset */
+#if 0 /* 000000000000000000000000 */
 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
 {
     Node *pNode;
@@ -2283,6 +2286,199 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
         pLastProp = NULL;
     }
 }
+#endif /* 000000000000000000000000 */
+
+/* Report a redundant or unusable charset META as unexpected and drop it. */
+static void DiscardMetaCharset(TidyDocImpl* doc, Node *head, Node *meta)
+{
+    TY_(ReportNotice)(doc, head, meta, DISCARDING_UNEXPECTED);
+    TY_(DiscardElement)(doc, meta);
+}
+
+/*\
+*  Issue #456 - Check meta charset
+*  1. if there is no meta charset, it adds one, according to doctype, no
+*     warning (only when the add-meta-charset option is enabled).
+*  2. if there is a <meta charset>, it moves it to the top of HEAD. Not sure
+*     this is required?
+*  3. if it doesn't match the output encoding, fix it. A changed charset
+*     attribute is always reported; a changed http-equiv content only when
+*     show-meta-change is enabled.
+*  4. if there are duplicates, discard them, with a notice.
+*
+*  Returns no when the check is skipped (no HEAD, no output encoding name,
+*  RAW or ISO2022 output, or body-only output), yes otherwise.
+\*/
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+    static const char contentPrefix[] = "text/html; charset=";
+    const uint prefixLen = sizeof(contentPrefix) - 1;
+    AttVal *charsetAttr;
+    AttVal *contentAttr;
+    AttVal *httpEquivAttr;
+    Bool charsetFound = no;
+    uint outenc = cfg(doc, TidyOutCharEncoding);
+    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
+    Node *currentNode;
+    Node *nextNode;
+    Node *head = TY_(FindHEAD)(doc);
+    Node *metaTag;
+    TidyBuffer buf;
+    TidyBuffer charsetString;
+    tmbstr newValue;
+    Bool add_meta = cfgBool(doc, TidyMetaCharset);
+
+    /* We can't do anything if we don't have a head or encoding is NULL */
+    if (!head || !enc || !TY_(tmbstrlen)(enc))
+        return no;
+    if (outenc == RAW)
+        return no;
+#ifndef NO_NATIVE_ISO2022_SUPPORT
+    if (outenc == ISO2022)
+        return no;
+#endif
+    if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
+        return no; /* nothing to do here if showing body only */
+
+    /* Set up the content test 'charset=value' */
+    tidyBufInit(&charsetString);
+    tidyBufAppend(&charsetString, "charset=", 8);
+    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
+    tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
+
+    /* Process the children of the head. The successor is saved up front
+       because the current node may be discarded or moved to the start of
+       HEAD; resuming from its old 'prev' pointer instead would dereference
+       NULL whenever the discarded node was the first child of HEAD. */
+    for (currentNode = head->content; currentNode; currentNode = nextNode)
+    {
+        nextNode = currentNode->next;
+        if (!nodeIsMETA(currentNode))
+            continue; /* not a meta node */
+        charsetAttr = attrGetCHARSET(currentNode);
+        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+        if (!charsetAttr && !httpEquivAttr)
+            continue; /* has neither charset nor http-equiv attribute */
+        /*
+          Meta charset comes in quite a few flavors:
+          1. <meta charset="..."> - expected for (X)HTML5.
+        */
+        if (charsetAttr && !httpEquivAttr)
+        {
+            /* it has no value, or we already found one, so remove it. */
+            if (charsetFound || !charsetAttr->value)
+            {
+                DiscardMetaCharset(doc, head, currentNode);
+                continue;
+            }
+            charsetFound = yes;
+            /* Fix mismatched attribute value */
+            if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
+            {
+                newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
+                TY_(tmbstrcpy)(newValue, enc);
+                /* Note: previously http-equiv had been modified, without warning
+                   in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head).
+                   Report first, while the attribute still holds the old value. */
+                TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
+                TidyDocFree(doc, charsetAttr->value); /* free current value */
+                charsetAttr->value = newValue;
+            }
+            /* Make sure it's the first element.
+               NOTE(review): this compares against the SECOND child of HEAD,
+               so a META already in second place stays there - confirm
+               whether head->content was intended. */
+            if (currentNode != head->content->next) {
+                TY_(RemoveNode)(currentNode);
+                TY_(InsertNodeAtStart)(head, currentNode);
+            }
+            continue;
+        }
+        /*
+          2. <meta http-equiv="Content-Type" content="...">
+          expected for HTML4. This is normally ok - but can clash.
+        */
+        if (httpEquivAttr && !charsetAttr)
+        {
+            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+            if (!contentAttr)
+                continue; /* has no 'content' attribute */
+            if (!httpEquivAttr->value)
+            {
+                DiscardMetaCharset(doc, head, currentNode);
+                continue;
+            }
+            if (TY_(tmbstrcasecmp)(httpEquivAttr->value, "content-type") != 0)
+                continue; /* is not 'content-type' */
+            if (!contentAttr->value)
+            {
+                /* While discarding this **seems** like a good idea, current
+                   tidy accepts it, see reg.test case-1117013.html, so it is
+                   kept for now. This could be reviewed in future, since there
+                   seems no need to keep this invalid meta */
+                continue; /* 'content' attribute has NO VALUE! */
+            }
+            /* we already found one, so remove the rest. */
+            if (charsetFound)
+            {
+                DiscardMetaCharset(doc, head, currentNode);
+                continue;
+            }
+            charsetFound = yes;
+            /* check encoding matches
+               If a mismatch is found here, fix it. Previously silently done
+               in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+               NOTE(review): the whole content value is compared with
+               'charset=value', so a value carrying a media type prefix is
+               always rewritten - confirm this is intended. */
+            if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) != 0)
+            {
+                /* correct the content */
+                newValue = (tmbstr)TidyDocAlloc(doc, prefixLen + TY_(tmbstrlen)(enc) + 1);
+                TY_(tmbstrcpy)(newValue, contentPrefix);
+                TY_(tmbstrcpy)(newValue + prefixLen, enc);
+                if (cfgBool(doc, TidyShowMetaChange)) /* Issue #456 - backward compatibility only */
+                    TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
+                TidyDocFree(doc, contentAttr->value);
+                contentAttr->value = newValue;
+            }
+            continue;
+        }
+        /*
+          3. <meta charset="..." http-equiv="...">
+          This is generally bad. Discard and warn.
+        */
+        DiscardMetaCharset(doc, head, currentNode);
+    }
+
+    /* completed head scan - add appropriate meta - if 'yes' and none exists */
+    if (add_meta && !charsetFound)
+    {
+        /* add appropriate meta charset tag - no warning */
+        metaTag = TY_(InferredTag)(doc, TidyTag_META);
+        switch (TY_(HTMLVersion)(doc))
+        {
+        case HT50:
+        case XH50:
+            TY_(AddAttribute)(doc, metaTag, "charset", enc);
+            break;
+        default:
+            tidyBufInit(&buf);
+            tidyBufAppend(&buf, "text/html; ", 11);
+            tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
+            tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
+            TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
+            TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); /* add the 'content' value */
+            tidyBufFree(&buf);
+            break;
+        }
+        TY_(InsertNodeAtStart)(head, metaTag);
+        TY_(ReportNotice)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
+    }
+    tidyBufFree(&charsetString);
+    return yes;
+}
+
 
 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
 {
diff --git a/src/clean.h b/src/clean.h
index eb659fd2d..f0dab0f2d 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -63,8 +63,11 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
 #if 0
 void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
 #endif
-
+/* Issue #456 - This is discarded */
+#if 0
 void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
+#endif
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
 
 void TY_(DropComments)(TidyDocImpl* doc, Node* node);
 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
diff --git a/src/config.c b/src/config.c
index e4484dc6f..94144910c 100644
--- a/src/config.c
+++ b/src/config.c
@@ -262,6 +262,7 @@ static const TidyOptionImpl option_defs[] =
     { TidyMergeDivs, MU, "merge-divs", IN, TidyAutoState, ParsePickList, &autoBoolPicks 
}, { TidyMergeEmphasis, MU, "merge-emphasis", BL, yes, ParsePickList, &boolPicks }, { TidyMergeSpans, MU, "merge-spans", IN, TidyAutoState, ParsePickList, &autoBoolPicks }, + { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParsePickList, &boolPicks }, /* 20161004 - Issue #456 */ #if SUPPORT_ASIAN_ENCODINGS { TidyNCR, MU, "ncr", BL, yes, ParsePickList, &boolPicks }, #endif @@ -287,6 +288,7 @@ static const TidyOptionImpl option_defs[] = { TidyShowErrors, DG, "show-errors", IN, 6, ParseInt, NULL }, { TidyShowInfo, DG, "show-info", BL, yes, ParsePickList, &boolPicks }, { TidyShowMarkup, PP, "markup", BL, yes, ParsePickList, &boolPicks }, + { TidyShowMetaChange, MS, "show-meta-change", BL, no, ParsePickList, &boolPicks }, /* 20170609 - Issue #456 */ { TidyShowWarnings, DG, "show-warnings", BL, yes, ParsePickList, &boolPicks }, { TidySkipNested, MU, "skip-nested", BL, yes, ParsePickList, &boolPicks }, /* 1642186 - Issue #65 */ { TidySortAttributes, PP, "sort-attributes", IN, TidySortAttrNone,ParsePickList, &sorterPicks }, diff --git a/src/language_en.h b/src/language_en.h index 915d398a0..8aa4c3ffa 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -1512,6 +1512,33 @@ static languageDefinition language_en = { whichPluralForm_en, { "This option specifies if Tidy should use the XML parser rather than the " "error correcting HTML parser. " }, + {/* Important notes for translators: + - Use only , , , , and +
. + - Entities, tags, attributes, etc., should be enclosed in . + - Option values should be enclosed in . + - It's very important that
be self-closing! + - The strings "Tidy" and "HTML Tidy" are the program name and must not + be translated. */ + TidyMetaCharset, 0, + "This option, when enabled, adds a <meta> element " + "and sets the charset attribute to the encoding of the " + "document. Set this option to yes to enable it. " + }, + {/* Important notes for translators: + - Use only , , , , and +
. + - Entities, tags, attributes, etc., should be enclosed in . + - Option values should be enclosed in . + - It's very important that
be self-closing! + - The strings "Tidy" and "HTML Tidy" are the program name and must not + be translated. */ + TidyShowMetaChange, 0, + "This option enables a message whenever Tidy changes the " + "content attribute of a meta charset declaration to match " + "the encoding of the document. Set this option to yes to " + "enable it. " + }, {/* Important notes for translators: - Use only , , , , and
. @@ -1519,8 +1546,8 @@ static languageDefinition language_en = { whichPluralForm_en, { - Option values should be enclosed in . - It's very important that
be self-closing! - The strings "Tidy" and "HTML Tidy" are the program name and must not - be translated. */ - TidyStyleTags, 0, + be translated. */ + TidyStyleTags, 0, "This option specifies if Tidy should move all style tags to the " "head of the document. " }, @@ -1788,7 +1815,7 @@ static languageDefinition language_en = { whichPluralForm_en, { "https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md" }, - + /******************************************** ** Report Output ** @remark enum source TidyStrings @@ -1797,6 +1824,7 @@ static languageDefinition language_en = { whichPluralForm_en, { { ANCHOR_NOT_UNIQUE, 0, "%s anchor \"%s\" already defined" }, /* ReportAttrError */ { ATTR_VALUE_NOT_LCASE, 0, "%s attribute value \"%s\" must be lower case for XHTML" }, /* ReportAttrError */ { ATTRIBUTE_IS_NOT_ALLOWED, 0, "%s attribute \"is\" not allowed for autonomous custom tags." }, /* ReportAttrError */ + { ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\", incorrect value \"%s\" replaced" }, /* ReportAttrError/TidyInfo */ { BACKSLASH_IN_URI, 0, "%s URI reference contains backslash. Typo?" }, /* ReportAttrError */ { BAD_ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\" had invalid value \"%s\" and has been replaced" }, /* ReportAttrError */ { BAD_ATTRIBUTE_VALUE, 0, "%s attribute \"%s\" has invalid value \"%s\"" }, /* ReportAttrError */ @@ -1897,6 +1925,8 @@ static languageDefinition language_en = { whichPluralForm_en, { { MOVED_STYLE_TO_HEAD, 0, "moved