Skip to content

Issue 456 - meta charset option #565

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Aug 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
169bd38
Part 1 - Add basic infra for 'add-meta-charset' option
marcoscaceres Oct 4, 2016
040c22c
Part 2 - Implement lexer logic
marcoscaceres Oct 4, 2016
cfc22ac
Add garvankeeley's suggestions using calloc
marcoscaceres Oct 5, 2016
2d7ddfe
Part 2.1 - Bug fixes and warning
marcoscaceres Oct 5, 2016
b1629c4
fix(lexer): bad attribute reporting
marcoscaceres Oct 5, 2016
53ee94d
fix: incorrect check for first element in head
marcoscaceres Oct 6, 2016
932cc10
feat(attrask.c): learn about charset attr
marcoscaceres Oct 6, 2016
523d58b
refactor: ask for charset and http_equiv attrs
marcoscaceres Oct 6, 2016
aff76be
fix(lexer.c): fixes from initial review
marcoscaceres Oct 17, 2016
8843199
Issue #456 - Merge branch 'meta-charset' of tidy-html5-marco.
geoffmcl May 13, 2017
6ebd12b
Issue #456 - More work on this option
geoffmcl May 14, 2017
f310f1d
Issue #456 - Move new TidyMetaCharset to clean
geoffmcl May 15, 2017
a7a4cd6
Issue #456 - avoid head work if showing body only
geoffmcl May 15, 2017
21f0085
Issue #456 - Oops, also out of 'lexer.h'
geoffmcl May 15, 2017
049bc6c
mERGE branch 'next' into issue-456
geoffmcl May 27, 2017
8a932f9
Issue #456 - Oops, incorrect merge conflict
geoffmcl May 27, 2017
40e1d64
Issue #456 - A desparate commit to get this WIP right, but...
geoffmcl May 27, 2017
722a841
Merge branch 'next' into issue-456
geoffmcl May 29, 2017
e28ec72
Merge branch 'next' into issue-456
geoffmcl Jun 4, 2017
13b34c9
Issue #456 - BAH! Fix a stupid logic reversal
geoffmcl Jun 4, 2017
a4770da
Issue #456 - Add 'Info:' message, when meta added.
geoffmcl Jun 4, 2017
9729264
Issue #456 - Add 'Info:' message when charset replaced
geoffmcl Jun 5, 2017
b32e14a
Issue #456 - add new option `show-meta-change`
geoffmcl Jun 9, 2017
1562c42
Merge branch 'next' into issue-456
balthisar Aug 28, 2017
e5a05ae
Address merge conflicts.
balthisar Aug 31, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions include/tidyenum.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ extern "C" {
FN(BACKSLASH_IN_URI) \
FN(BAD_ATTRIBUTE_VALUE_REPLACED) \
FN(BAD_ATTRIBUTE_VALUE) \
FN(ATTRIBUTE_VALUE_REPLACED) \
FN(ESCAPED_ILLEGAL_URI) \
FN(FIXED_BACKSLASH) \
FN(ID_NAME_MISMATCH) \
Expand Down Expand Up @@ -258,14 +259,15 @@ extern "C" {
FN(REMOVED_HTML5) \
FN(XML_DECLARATION_DETECTED) \
/* Report, mixed use */ \
FN(ADDED_MISSING_CHARSET) \
FN(COERCE_TO_ENDTAG) \
FN(ELEMENT_NOT_EMPTY) \
FN(FOUND_STYLE_IN_BODY) \
FN(MOVED_STYLE_TO_HEAD) \
FN(UNEXPECTED_END_OF_FILE) \
FN(UNEXPECTED_ENDTAG) \
FN(UNEXPECTED_ENDTAG_ERR) \
FN(MOVED_STYLE_TO_HEAD) \
FN(FOUND_STYLE_IN_BODY)

FN(UNEXPECTED_ENDTAG_ERR)


/** These are report messages added by Tidy's accessibility module.
** Note that commented out items don't have checks for them at this time,
Expand Down Expand Up @@ -589,6 +591,7 @@ typedef enum
TidyMergeDivs, /**< Merge multiple DIVs */
TidyMergeEmphasis, /**< Merge nested B and I elements */
TidyMergeSpans, /**< Merge multiple SPANs */
TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */
#if SUPPORT_ASIAN_ENCODINGS
TidyNCR, /**< Allow numeric character references */
#else
Expand Down Expand Up @@ -620,11 +623,12 @@ typedef enum
TidyShowErrors, /**< Number of errors to put out */
TidyShowInfo, /**< If true, info-level messages are shown */
TidyShowMarkup, /**< If false, normal output is suppressed */
TidyShowMetaChange, /**< show when meta http-equiv content charset was changed - compatibility */
TidyShowWarnings, /**< However errors are always shown */
TidySkipNested, /**< Skip nested tags in script and style CDATA */
TidySortAttributes, /**< Sort attributes */
TidyStrictTagsAttr, /**< Ensure tags and attributes match output HTML version */
TidyStyleTags, /**< Move style to head */
TidyStyleTags, /**< Move style to head */
TidyTabSize, /**< Expand tabs to n spaces */
TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */
TidyUpperCaseTags, /**< Output tags in upper not lower case */
Expand Down
2 changes: 2 additions & 0 deletions src/attrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
Expand Down Expand Up @@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET )
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
Expand Down
219 changes: 219 additions & 0 deletions src/clean.c
Original file line number Diff line number Diff line change
Expand Up @@ -2208,6 +2208,9 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
}
#endif

/* Issue #456 - This is discarded
See replacement TidyMetaCharset */
#if 0 /* 000000000000000000000000 */
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
{
Node *pNode;
Expand Down Expand Up @@ -2283,6 +2286,222 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
pLastProp = NULL;
}
}
#endif /* 000000000000000000000000 */

/*\
* Issue #456 - Check meta charset
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
* 4. if there are duplicates, discard them, with warning.
\*/
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
{
AttVal *charsetAttr;
AttVal *contentAttr;
AttVal *httpEquivAttr;
Bool charsetFound = no;
uint outenc = cfg(doc, TidyOutCharEncoding);
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
Node *currentNode;
Node *head = TY_(FindHEAD)(doc);
Node *metaTag;
Node *prevNode;
TidyBuffer buf;
TidyBuffer charsetString;
/* tmbstr httpEquivAttrValue; */
/* tmbstr lcontent; */
tmbstr newValue;
Bool add_meta = cfgBool(doc, TidyMetaCharset);

/* We can't do anything we don't have a head or encoding is NULL */
if (!head || !enc || !TY_(tmbstrlen)(enc))
return no;
if (outenc == RAW)
return no;
#ifndef NO_NATIVE_ISO2022_SUPPORT
if (outenc == ISO2022)
return no;
#endif
if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
return no; /* nothing to do here if showing body only */

#if 0 /* 000000000000000000000000 */
if (!add_meta) {
TY_(VerifyHTTPEquiv)(doc, head);
return no;
}
#endif /* 000000000000000000000000 */

tidyBufInit(&charsetString);
/* Set up the content test 'charset=value' */
tidyBufClear(&charsetString);
tidyBufAppend(&charsetString, "charset=", 8);
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
/* process the children of the head */
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
{
if (!nodeIsMETA(currentNode))
continue; /* not a meta node */
charsetAttr = attrGetCHARSET(currentNode);
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
if (!charsetAttr && !httpEquivAttr)
continue; /* has no charset attribute */
/*
Meta charset comes in quite a few flavors:
1. <meta charset="value"> - expected for (X)HTML5.
*/
if (charsetAttr && !httpEquivAttr)
{
/* we already found one, so remove the rest. */
if (charsetFound || !charsetAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
/* Fix mismatched attribute value */
if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
{
newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
TY_(tmbstrcpy)(newValue, enc);
/* Note: previously http-equiv had been modified, without warning
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
*/
TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
TidyDocFree(doc, charsetAttr->value); /* free current value */
charsetAttr->value = newValue;
}
/* Make sure it's the first element. */
if (currentNode != head->content->next) {
TY_(RemoveNode)(currentNode);
TY_(InsertNodeAtStart)(head, currentNode);
}
continue;
}
/*
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
expected for HTML4. This is normally ok - but can clash.
*/
if (httpEquivAttr && !charsetAttr)
{
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
if (!contentAttr)
continue; /* has no 'content' attribute */
if (!httpEquivAttr->value)
{
prevNode = currentNode->prev;
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
/* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
continue; /* is not 'content-type' */
if (!contentAttr->value)
{
/* While this **seems** like a good idea, current tidy accepts this
see reg.test case-1117013.html which contains
<META HTTP-EQUIV="Content-Type" CONTENT=""> so for now. This could be reviewed
in future, since there seem no need to keep this invalid meta */
#if 0 /* 0000000000000000000000000000000000000000000000000 */
prevNode = currentNode->prev;
/* maybe need better message here */
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
#endif /* 000000000000000000000000000000000000000000000000 */
continue; /* has no 'content' attribute has NO VALUE! */
}
/* check encoding matches
If a miss-match found here, fix it. previous silently done
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
lcontent = TY_(tmbstrtolower)(contentAttr->value);
*/
if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
{
/* we already found one, so remove the rest. */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
continue;
}
charsetFound = yes;
}
else
{
/* fix a mis-match */
if (charsetFound)
{
prevNode = currentNode->prev;
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
else
{
/* correct the content */
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
TY_(tmbstrcpy)(newValue, "text/html; charset=");
TY_(tmbstrcpy)(newValue + 19, enc);
if (cfgBool(doc, TidyShowMetaChange)) /* Issue #456 - backward compatibility only */
TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
TidyDocFree(doc, contentAttr->value);
contentAttr->value = newValue;
charsetFound = yes;
}
}
continue;
}
/*
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
This is generally bad. Discard and warn.
*/
if (httpEquivAttr && charsetAttr)
{
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
prevNode = currentNode->prev;
TY_(ReportNotice)(doc, head, currentNode, DISCARDING_UNEXPECTED);
TY_(DiscardElement)(doc, currentNode);
currentNode = prevNode;
}
}

/* completed head scan - add appropriate meta - if 'yes' and none exists */
if (add_meta && !charsetFound)
{
/* add appropriate meta charset tag - no warning */
metaTag = TY_(InferredTag)(doc, TidyTag_META);
switch (TY_(HTMLVersion)(doc))
{
case HT50:
case XH50:
TY_(AddAttribute)(doc, metaTag, "charset", enc);
break;
default:
tidyBufInit(&buf);
tidyBufAppend(&buf, "text/html; ", 11);
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); /* add 'content="<enc>"' */
tidyBufFree(&buf);
}
TY_(InsertNodeAtStart)(head, metaTag);
TY_(ReportNotice)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
}
tidyBufFree(&charsetString);
return yes;
}


void TY_(DropComments)(TidyDocImpl* doc, Node* node)
{
Expand Down
5 changes: 4 additions & 1 deletion src/clean.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,11 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
#if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif

/* Issue #456 - This is discarded */
#if 0
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
#endif
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);

void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
Expand Down
2 changes: 2 additions & 0 deletions src/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyMergeDivs, MU, "merge-divs", IN, TidyAutoState, ParsePickList, &autoBoolPicks },
{ TidyMergeEmphasis, MU, "merge-emphasis", BL, yes, ParsePickList, &boolPicks },
{ TidyMergeSpans, MU, "merge-spans", IN, TidyAutoState, ParsePickList, &autoBoolPicks },
{ TidyMetaCharset, MS, "add-meta-charset", BL, no, ParsePickList, &boolPicks }, /* 20161004 - Issue #456 */
#if SUPPORT_ASIAN_ENCODINGS
{ TidyNCR, MU, "ncr", BL, yes, ParsePickList, &boolPicks },
#endif
Expand All @@ -287,6 +288,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyShowErrors, DG, "show-errors", IN, 6, ParseInt, NULL },
{ TidyShowInfo, DG, "show-info", BL, yes, ParsePickList, &boolPicks },
{ TidyShowMarkup, PP, "markup", BL, yes, ParsePickList, &boolPicks },
{ TidyShowMetaChange, MS, "show-meta-change", BL, no, ParsePickList, &boolPicks }, /* 20170609 - Issue #456 */
{ TidyShowWarnings, DG, "show-warnings", BL, yes, ParsePickList, &boolPicks },
{ TidySkipNested, MU, "skip-nested", BL, yes, ParsePickList, &boolPicks }, /* 1642186 - Issue #65 */
{ TidySortAttributes, PP, "sort-attributes", IN, TidySortAttrNone,ParsePickList, &sorterPicks },
Expand Down
36 changes: 33 additions & 3 deletions src/language_en.h
Original file line number Diff line number Diff line change
Expand Up @@ -1512,15 +1512,42 @@ static languageDefinition language_en = { whichPluralForm_en, {
"This option specifies if Tidy should use the XML parser rather than the "
"error correcting HTML parser. "
},
{/* Important notes for translators:
- Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
<br/>.
- Entities, tags, attributes, etc., should be enclosed in <code></code>.
- Option values should be enclosed in <var></var>.
- It's very important that <br/> be self-closing!
- The strings "Tidy" and "HTML Tidy" are the program name and must not
be translated. */
TidyMetaCharset, 0,
"This option, when enabled, adds a <code>&lt;meta&gt;</code> element "
"and sets the <code>charset</code> attribute to the encoding of the "
"document. Set this option to <var>yes</var> to enable it. "
},
{/* Important notes for translators:
- Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
<br/>.
- Entities, tags, attributes, etc., should be enclosed in <code></code>.
- Option values should be enclosed in <var></var>.
- It's very important that <br/> be self-closing!
- The strings "Tidy" and "HTML Tidy" are the program name and must not
be translated. */
TidyShowMetaChange, 0,
"This option enables a message whenever Tidy changes the "
"<code>content</code> attribute of a meta charset declaration to match "
"the encoding of the document. Set this option to <var>yes</var> to "
"enable it. "
},
{/* Important notes for translators:
- Use only <code></code>, <var></var>, <em></em>, <strong></strong>, and
<br/>.
- Entities, tags, attributes, etc., should be enclosed in <code></code>.
- Option values should be enclosed in <var></var>.
- It's very important that <br/> be self-closing!
- The strings "Tidy" and "HTML Tidy" are the program name and must not
be translated. */
TidyStyleTags, 0,
be translated. */
TidyStyleTags, 0,
"This option specifies if Tidy should move all style tags to the "
"head of the document. "
},
Expand Down Expand Up @@ -1788,7 +1815,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md"
},


/********************************************
** Report Output
** @remark enum source TidyStrings
Expand All @@ -1797,6 +1824,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ ANCHOR_NOT_UNIQUE, 0, "%s anchor \"%s\" already defined" }, /* ReportAttrError */
{ ATTR_VALUE_NOT_LCASE, 0, "%s attribute value \"%s\" must be lower case for XHTML" }, /* ReportAttrError */
{ ATTRIBUTE_IS_NOT_ALLOWED, 0, "%s attribute \"is\" not allowed for autonomous custom tags." }, /* ReportAttrError */
{ ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\", incorrect value \"%s\" replaced" }, /* ReportAttrError/TidyInfo */
{ BACKSLASH_IN_URI, 0, "%s URI reference contains backslash. Typo?" }, /* ReportAttrError */
{ BAD_ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\" had invalid value \"%s\" and has been replaced" }, /* ReportAttrError */
{ BAD_ATTRIBUTE_VALUE, 0, "%s attribute \"%s\" has invalid value \"%s\"" }, /* ReportAttrError */
Expand Down Expand Up @@ -1897,6 +1925,8 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ MOVED_STYLE_TO_HEAD, 0, "moved <style> tag to <head>! fix-style-tags: no to avoid." }, /* ReportWarning */
{ FOUND_STYLE_IN_BODY, 0, "found <style> tag in <body>! fix-style-tags: yes to move." }, /* ReportWarning */

{ ADDED_MISSING_CHARSET, 0, "Added appropriate missing <meta charset=...> to %s" }, /* ReportInfo */

#if SUPPORT_ACCESSIBILITY_CHECKS

/***************************************
Expand Down
Loading