diff --git a/include/tidyenum.h b/include/tidyenum.h
index e38a63e71..9b752cc00 100644
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@@ -169,6 +169,7 @@ extern "C" {
FN(BACKSLASH_IN_URI) \
FN(BAD_ATTRIBUTE_VALUE_REPLACED) \
FN(BAD_ATTRIBUTE_VALUE) \
+ FN(ATTRIBUTE_VALUE_REPLACED) \
FN(ESCAPED_ILLEGAL_URI) \
FN(FIXED_BACKSLASH) \
FN(ID_NAME_MISMATCH) \
@@ -258,14 +259,15 @@ extern "C" {
FN(REMOVED_HTML5) \
FN(XML_DECLARATION_DETECTED) \
/* Report, mixed use */ \
+ FN(ADDED_MISSING_CHARSET) \
FN(COERCE_TO_ENDTAG) \
FN(ELEMENT_NOT_EMPTY) \
+ FN(FOUND_STYLE_IN_BODY) \
+ FN(MOVED_STYLE_TO_HEAD) \
FN(UNEXPECTED_END_OF_FILE) \
FN(UNEXPECTED_ENDTAG) \
- FN(UNEXPECTED_ENDTAG_ERR) \
- FN(MOVED_STYLE_TO_HEAD) \
- FN(FOUND_STYLE_IN_BODY)
-
+ FN(UNEXPECTED_ENDTAG_ERR)
+
/** These are report messages added by Tidy's accessibility module.
** Note that commented out items don't have checks for them at this time,
@@ -589,6 +591,7 @@ typedef enum
TidyMergeDivs, /**< Merge multiple DIVs */
TidyMergeEmphasis, /**< Merge nested B and I elements */
TidyMergeSpans, /**< Merge multiple SPANs */
+ TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
#if SUPPORT_ASIAN_ENCODINGS
TidyNCR, /**< Allow numeric character references */
#else
@@ -620,11 +623,12 @@ typedef enum
TidyShowErrors, /**< Number of errors to put out */
TidyShowInfo, /**< If true, info-level messages are shown */
TidyShowMarkup, /**< If false, normal output is suppressed */
+ TidyShowMetaChange, /**< show when meta http-equiv content charset was changed - compatibility */
TidyShowWarnings, /**< However errors are always shown */
TidySkipNested, /**< Skip nested tags in script and style CDATA */
TidySortAttributes, /**< Sort attributes */
TidyStrictTagsAttr, /**< Ensure tags and attributes match output HTML version */
- TidyStyleTags, /**< Move style to head */
+ TidyStyleTags, /**< Move sytle to head */
TidyTabSize, /**< Expand tabs to n spaces */
TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */
TidyUpperCaseTags, /**< Output tags in upper not lower case */
diff --git a/src/attrs.h b/src/attrs.h
index e5b0fa975..0192efcbd 100644
--- a/src/attrs.h
+++ b/src/attrs.h
@@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
+#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
@@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
+#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
diff --git a/src/clean.c b/src/clean.c
index 7b56f34ac..707e4d90a 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -2208,6 +2208,9 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
}
#endif
+/* Issue #456 - This is discarded
+ See replacement TidyMetaCharset */
+#if 0 /* 000000000000000000000000 */
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{
Node *pNode;
@@ -2283,6 +2286,222 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
pLastProp = NULL;
}
}
+#endif /* 000000000000000000000000 */
+
+/*\
+* Issue #456 - Check meta charset
+* 1. if there is no meta charset, it adds one, according to doctype, no warning.
+* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
+* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
+* 4. if there are duplicates, discard them, with warning.
+\*/
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
+{
+ AttVal *charsetAttr;
+ AttVal *contentAttr;
+ AttVal *httpEquivAttr;
+ Bool charsetFound = no;
+ uint outenc = cfg(doc, TidyOutCharEncoding);
+ ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
+ Node *currentNode;
+ Node *head = TY_(FindHEAD)(doc);
+ Node *metaTag;
+ Node *prevNode;
+ TidyBuffer buf;
+ TidyBuffer charsetString;
+ /* tmbstr httpEquivAttrValue; */
+ /* tmbstr lcontent; */
+ tmbstr newValue;
+ Bool add_meta = cfgBool(doc, TidyMetaCharset);
+
+ /* We can't do anything we don't have a head or encoding is NULL */
+ if (!head || !enc || !TY_(tmbstrlen)(enc))
+ return no;
+ if (outenc == RAW)
+ return no;
+#ifndef NO_NATIVE_ISO2022_SUPPORT
+ if (outenc == ISO2022)
+ return no;
+#endif
+ if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
+ return no; /* nothing to do here if showing body only */
+
+#if 0 /* 000000000000000000000000 */
+ if (!add_meta) {
+ TY_(VerifyHTTPEquiv)(doc, head);
+ return no;
+ }
+#endif /* 000000000000000000000000 */
+
+ tidyBufInit(&charsetString);
+ /* Set up the content test 'charset=value' */
+ tidyBufClear(&charsetString);
+ tidyBufAppend(&charsetString, "charset=", 8);
+ tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
+ tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
+ /* process the children of the head */
+ for (currentNode = head->content; currentNode; currentNode = currentNode->next)
+ {
+ if (!nodeIsMETA(currentNode))
+ continue; /* not a meta node */
+ charsetAttr = attrGetCHARSET(currentNode);
+ httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
+ if (!charsetAttr && !httpEquivAttr)
+ continue; /* has no charset attribute */
+ /*
+ Meta charset comes in quite a few flavors:
+ 1. - expected for (X)HTML5.
+ */
+ if (charsetAttr && !httpEquivAttr)
+ {
+ /* we already found one, so remove the rest. */
+ if (charsetFound || !charsetAttr->value)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ continue;
+ }
+ charsetFound = yes;
+ /* Fix mismatched attribute value */
+ if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
+ {
+ newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
+ TY_(tmbstrcpy)(newValue, enc);
+ /* Note: previously http-equiv had been modified, without warning
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+ */
+ TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
+ TidyDocFree(doc, charsetAttr->value); /* free current value */
+ charsetAttr->value = newValue;
+ }
+ /* Make sure it's the first element. */
+ if (currentNode != head->content->next) {
+ TY_(RemoveNode)(currentNode);
+ TY_(InsertNodeAtStart)(head, currentNode);
+ }
+ continue;
+ }
+ /*
+ 2.
+ expected for HTML4. This is normally ok - but can clash.
+ */
+ if (httpEquivAttr && !charsetAttr)
+ {
+ contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
+ if (!contentAttr)
+ continue; /* has no 'content' attribute */
+ if (!httpEquivAttr->value)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ continue;
+ }
+ /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
+ if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
+ continue; /* is not 'content-type' */
+ if (!contentAttr->value)
+ {
+ /* While this **seems** like a good idea, current tidy accepts this
+ see reg.test case-1117013.html which contains
+ so for now. This could be reviewed
+ in future, since there seem no need to keep this invalid meta */
+#if 0 /* 0000000000000000000000000000000000000000000000000 */
+ prevNode = currentNode->prev;
+ /* maybe need better message here */
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+#endif /* 000000000000000000000000000000000000000000000000 */
+ continue; /* has no 'content' attribute has NO VALUE! */
+ }
+ /* check encoding matches
+ If a miss-match found here, fix it. previous silently done
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
+ lcontent = TY_(tmbstrtolower)(contentAttr->value);
+ */
+ if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
+ {
+ /* we already found one, so remove the rest. */
+ if (charsetFound)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ continue;
+ }
+ charsetFound = yes;
+ }
+ else
+ {
+ /* fix a mis-match */
+ if (charsetFound)
+ {
+ prevNode = currentNode->prev;
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ }
+ else
+ {
+ /* correct the content */
+ newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
+ TY_(tmbstrcpy)(newValue, "text/html; charset=");
+ TY_(tmbstrcpy)(newValue + 19, enc);
+ if (cfgBool(doc, TidyShowMetaChange)) /* Issue #456 - backward compatibility only */
+ TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
+ TidyDocFree(doc, contentAttr->value);
+ contentAttr->value = newValue;
+ charsetFound = yes;
+ }
+ }
+ continue;
+ }
+ /*
+ 3.
+ This is generally bad. Discard and warn.
+ */
+ if (httpEquivAttr && charsetAttr)
+ {
+ /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
+ prevNode = currentNode->prev;
+ TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
+ TY_(DiscardElement)(doc, currentNode);
+ currentNode = prevNode;
+ }
+ }
+
+ /* completed head scan - add appropriate meta - if 'yes' and none exists */
+ if (add_meta && !charsetFound)
+ {
+ /* add appropriate meta charset tag - no warning */
+ metaTag = TY_(InferredTag)(doc, TidyTag_META);
+ switch (TY_(HTMLVersion)(doc))
+ {
+ case HT50:
+ case XH50:
+ TY_(AddAttribute)(doc, metaTag, "charset", enc);
+ break;
+ default:
+ tidyBufInit(&buf);
+ tidyBufAppend(&buf, "text/html; ", 11);
+ tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
+ tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
+ TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
+ TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); /* add 'content=""' */
+ tidyBufFree(&buf);
+ }
+ TY_(InsertNodeAtStart)(head, metaTag);
+ TY_(ReportNotice)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
+ }
+ tidyBufFree(&charsetString);
+ return yes;
+}
+
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{
diff --git a/src/clean.h b/src/clean.h
index eb659fd2d..f0dab0f2d 100644
--- a/src/clean.h
+++ b/src/clean.h
@@ -63,8 +63,11 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
#if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif
-
+/* Issue #456 - This is discarded */
+#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
+#endif
+Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
diff --git a/src/config.c b/src/config.c
index e4484dc6f..94144910c 100644
--- a/src/config.c
+++ b/src/config.c
@@ -262,6 +262,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyMergeDivs, MU, "merge-divs", IN, TidyAutoState, ParsePickList, &autoBoolPicks },
{ TidyMergeEmphasis, MU, "merge-emphasis", BL, yes, ParsePickList, &boolPicks },
{ TidyMergeSpans, MU, "merge-spans", IN, TidyAutoState, ParsePickList, &autoBoolPicks },
+ { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParsePickList, &boolPicks }, /* 20161004 - Issue #456 */
#if SUPPORT_ASIAN_ENCODINGS
{ TidyNCR, MU, "ncr", BL, yes, ParsePickList, &boolPicks },
#endif
@@ -287,6 +288,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyShowErrors, DG, "show-errors", IN, 6, ParseInt, NULL },
{ TidyShowInfo, DG, "show-info", BL, yes, ParsePickList, &boolPicks },
{ TidyShowMarkup, PP, "markup", BL, yes, ParsePickList, &boolPicks },
+ { TidyShowMetaChange, MS, "show-meta-change", BL, no, ParsePickList, &boolPicks }, /* 20170609 - Issue #456 */
{ TidyShowWarnings, DG, "show-warnings", BL, yes, ParsePickList, &boolPicks },
{ TidySkipNested, MU, "skip-nested", BL, yes, ParsePickList, &boolPicks }, /* 1642186 - Issue #65 */
{ TidySortAttributes, PP, "sort-attributes", IN, TidySortAttrNone,ParsePickList, &sorterPicks },
diff --git a/src/language_en.h b/src/language_en.h
index 915d398a0..8aa4c3ffa 100644
--- a/src/language_en.h
+++ b/src/language_en.h
@@ -1512,6 +1512,33 @@ static languageDefinition language_en = { whichPluralForm_en, {
"This option specifies if Tidy should use the XML parser rather than the "
"error correcting HTML parser. "
},
+ {/* Important notes for translators:
+ - Use only
, , , , and
+
.
+ - Entities, tags, attributes, etc., should be enclosed in
.
+ - Option values should be enclosed in .
+ - It's very important that
be self-closing!
+ - The strings "Tidy" and "HTML Tidy" are the program name and must not
+ be translated. */
+ TidyMetaCharset, 0,
+ "This option, when enabled, adds a <meta>
element "
+ "and sets the charset
attribute to the encoding of the "
+ "document. Set this option to yes to enable it. "
+ },
+ {/* Important notes for translators:
+ - Use only
, , , , and
+
.
+ - Entities, tags, attributes, etc., should be enclosed in
.
+ - Option values should be enclosed in .
+ - It's very important that
be self-closing!
+ - The strings "Tidy" and "HTML Tidy" are the program name and must not
+ be translated. */
+ TidyShowMetaChange, 0,
+ "This option enables a message whenever Tidy changes the "
+ "content
attribute of a meta charset declaration to match "
+ "the encoding of the document. Set this option to yes to "
+ "enable it. "
+ },
{/* Important notes for translators:
- Use only
, , , , and
.
@@ -1519,8 +1546,8 @@ static languageDefinition language_en = { whichPluralForm_en, {
- Option values should be enclosed in .
- It's very important that
be self-closing!
- The strings "Tidy" and "HTML Tidy" are the program name and must not
- be translated. */
- TidyStyleTags, 0,
+ be translated. */
+ TidyStyleTags, 0,
"This option specifies if Tidy should move all style tags to the "
"head of the document. "
},
@@ -1788,7 +1815,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
},
-
+
/********************************************
** Report Output
** @remark enum source TidyStrings
@@ -1797,6 +1824,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ ANCHOR_NOT_UNIQUE, 0, "%s anchor \"%s\" already defined" }, /* ReportAttrError */
{ ATTR_VALUE_NOT_LCASE, 0, "%s attribute value \"%s\" must be lower case for XHTML" }, /* ReportAttrError */
{ ATTRIBUTE_IS_NOT_ALLOWED, 0, "%s attribute \"is\" not allowed for autonomous custom tags." }, /* ReportAttrError */
+ { ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\", incorrect value \"%s\" replaced" }, /* ReportAttrError/TidyInfo */
{ BACKSLASH_IN_URI, 0, "%s URI reference contains backslash. Typo?" }, /* ReportAttrError */
{ BAD_ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\" had invalid value \"%s\" and has been replaced" }, /* ReportAttrError */
{ BAD_ATTRIBUTE_VALUE, 0, "%s attribute \"%s\" has invalid value \"%s\"" }, /* ReportAttrError */
@@ -1897,6 +1925,8 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ MOVED_STYLE_TO_HEAD, 0, "moved