From 169bd38adf77f4ebf07dd10d42225e7972c38757 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Tue, 4 Oct 2016 14:29:06 +1100 Subject: [PATCH 01/19] Part 1 - Add basic infra for 'add-meta-charset' option --- include/tidyenum.h | 1 + src/config.c | 1 + src/language_en.h | 12 ++++++++++++ src/lexer.c | 10 ++++++++++ src/lexer.h | 3 +++ src/tidylib.c | 4 ++++ 6 files changed, 31 insertions(+) diff --git a/include/tidyenum.h b/include/tidyenum.h index f494afbc3..d3c34f3d0 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -172,6 +172,7 @@ typedef enum TidySkipNested, /**< Skip nested tags in script and style CDATA */ TidyStrictTagsAttr, /**< Ensure tags and attributes match output HTML version */ TidyEscapeScripts, /**< Escape items that look like closing tags in script tags */ + TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */ N_TIDY_OPTIONS /**< Must be last */ } TidyOptionId; diff --git a/src/config.c b/src/config.c index ddb677c28..f040c965c 100644 --- a/src/config.c +++ b/src/config.c @@ -324,6 +324,7 @@ static const TidyOptionImpl option_defs[] = { TidySkipNested, MU, "skip-nested", BL, yes, ParseBool, boolPicks }, /* 1642186 - Issue #65 */ { TidyStrictTagsAttr, MU, "strict-tags-attributes", BL, no, ParseBool, boolPicks }, /* 20160209 - Issue #350 */ { TidyEscapeScripts, PP, "escape-scripts", BL, yes, ParseBool, boolPicks }, /* 20160227 - Issue #348 */ + { TidyMetaCharset, MS, "add-meta-charset", BL, yes, ParseBool, boolPicks }, /* 20161004 - Issue #456 */ { N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL } }; diff --git a/src/language_en.h b/src/language_en.h index 7a8d1de68..316ec09b0 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -2080,6 +2080,18 @@ static languageDefinition language_en = { whichPluralForm_en, { "This option causes items that look like closing tags, like </g to be escaped " "to <\\/g. Set this option to 'no' if you do not want this." }, + {/* Important notes for translators: + - Use only , , , , and +
. + - Entities, tags, attributes, etc., should be enclosed in . + - Option values should be enclosed in . + - It's very important that
be self-closing! + - The strings "Tidy" and "HTML Tidy" are the program name and must not + be translated. */ + TidyMetaCharset, 0, + "This option adds a meta element and sets the charset attribute to the encoding of the document." + "Set this option to 'yes' if you want this." + }, /******************************************************** ** Console Application diff --git a/src/lexer.c b/src/lexer.c index 6c220850f..ffc439493 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1674,6 +1674,16 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) return node; } +/* Check meta charset*/ +Bool TY_(TidyMetaCharset)( TidyDocImpl* doc ) +{ + AttVal *attval; + Node *node; + Node *head = TY_(FindHEAD)( doc ); + printf("hello"); + return no; +} + /* add meta element for Tidy */ Bool TY_(AddGenerator)( TidyDocImpl* doc ) { diff --git a/src/lexer.h b/src/lexer.h index 0c8d5bbf1..e390e7a2d 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -491,6 +491,9 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc); /* Returns containing block element, if any */ Node* TY_(FindContainer)( Node* node ); +/* Adds meta element and sets the charset */ +Bool TY_(TidyMetaCharset)( TidyDocImpl* doc ); + /* add meta element for Tidy */ Bool TY_(AddGenerator)( TidyDocImpl* doc ); diff --git a/src/tidylib.c b/src/tidylib.c index 4787336b1..4753ab132 100755 --- a/src/tidylib.c +++ b/src/tidylib.c @@ -1795,6 +1795,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut ); Bool xmlDecl = cfgBool( doc, TidyXmlDecl ); Bool tidyMark = cfgBool( doc, TidyMark ); + Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset); Bool tidyXmlTags = cfgBool( doc, TidyXmlTags ); Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName ); Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis ); @@ -1898,6 +1899,9 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) if (tidyMark ) TY_(AddGenerator)(doc); + + if (tidyMetaCharset) + TY_(TidyMetaCharset)(doc); } /* ensure presence of initial */ From 040c22c6dc26a81d30832ebcc5be91baf983dd49 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Tue, 4 Oct 2016 16:13:05 +1100 Subject: [PATCH 02/19] Part 2 - Implement lexer logic --- src/lexer.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index ffc439493..2b73604db 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1674,14 +1674,98 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) return node; } -/* Check meta charset*/ -Bool TY_(TidyMetaCharset)( TidyDocImpl* doc ) +/* Check meta charset + 1. if there is no meta charset, it adds one. + 2. if there is a meta charset, it moves it to the top if HEAD. + 3. if it doesn't match the output encoding, warn about that. + 4. if there are duplicates, discard them. + */ +Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { - AttVal *attval; - Node *node; Node *head = TY_(FindHEAD)( doc ); - printf("hello"); - return no; + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + Bool charsetFound = no; + // We can't do anything we don't have a head or encoding is NULL + if( !head || !enc ) + return no; + + for (Node *node = head->content; node; node = node->next) + { + if (!nodeIsMETA(node)) + continue; + AttVal *charsetAttr = TY_(AttrGetById)(node, TidyAttr_CHARSET); + AttVal *httpEquivAttr = TY_(AttrGetById)(node, TidyAttr_HTTP_EQUIV); + if(!charsetAttr && !httpEquivAttr) + continue; + + // Meta charset comes in quite a few flavors: + // 1. - expected for (X)HTML5. + if (charsetAttr && !httpEquivAttr) + { + // we already found one + if(charsetFound) + { + TY_(DiscardElement)( doc, node ); + printf("WARNING ABOUT DISCARDING ELEMENT \n"); + continue; + } + charsetFound = yes; + tmbstr lCharset = TY_(tmbstrtolower)(charsetAttr->value); + if(strcmp(lCharset, enc) == 0) + { + // Move it to head + TY_(RemoveNode)( node ); + TY_(InsertNodeAtStart)( head, node ); + } + else + { + printf("WARN ABOUT MISMATCH: %s not match output %s \n", lCharset, enc); + TY_(RemoveNode)( node ); + TY_(InsertNodeAtStart)( head, node ); + } + continue; + } + + // 2. + // expected for HTML4. This is normally ok - but can clash. + if(httpEquivAttr && !charsetAttr) + { + AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); + tmbstr lvalue = TY_(tmbstrtolower)(httpEquivAttr->value); + if(!contentAttr || strcmp(lvalue, "content-type") != 0) + continue; + tmbstr lcontent = TY_(tmbstrtolower)(contentAttr->value); + char expected[sizeof(enc) + 8] = "charset="; + strcat(expected, enc); + if(TY_(tmbsubstr)(lcontent, expected)){ + printf("WARN ABOUT CLASH: %s \n", contentAttr->value); + } + } + // 3. + // This is generally bad. + if(httpEquivAttr && charsetAttr) + { + printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); + } + } + if(charsetFound){ + return yes; + } + Node *node = TY_(InferredTag)(doc, TidyTag_META); + switch(TY_(HTMLVersion)(doc)) + { + case HT50: + case XH50: + TY_(AddAttribute)( doc, node, "charset", enc); + break; + default: + TY_(AddAttribute)( doc, node, "http-equiv", "content-type"); + TY_(AddAttribute)( doc, node, "content", "text/html; charset="); + AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); + TY_(tmbstrcat)(contentAttr->value, enc); + } + TY_(InsertNodeAtStart)( head, node ); + return yes; } /* add meta element for Tidy */ From cfc22ac46e4876170481a882ae3efa2641bbfb20 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Wed, 5 Oct 2016 18:54:25 +1100 Subject: [PATCH 03/19] Add garvankeeley's suggestions using calloc --- src/lexer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lexer.c b/src/lexer.c index 2b73604db..568ac36eb 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1735,11 +1735,14 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if(!contentAttr || strcmp(lvalue, "content-type") != 0) continue; tmbstr lcontent = TY_(tmbstrtolower)(contentAttr->value); - char expected[sizeof(enc) + 8] = "charset="; + char* charsetString = "charset="; + char* expected = calloc(strlen(enc) + strlen(charsetString) + 1, sizeof(char*)); + strcat(expected, charsetString); strcat(expected, enc); if(TY_(tmbsubstr)(lcontent, expected)){ printf("WARN ABOUT CLASH: %s \n", contentAttr->value); } + free(expected); } // 3. // This is generally bad. From 2d7ddfef94eacc5f100cdc55887b96b17e746b95 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Wed, 5 Oct 2016 20:14:18 +1100 Subject: [PATCH 04/19] Part 2.1 - Bug fixes and warning --- src/lexer.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 568ac36eb..882522cd6 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1697,29 +1697,31 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) AttVal *httpEquivAttr = TY_(AttrGetById)(node, TidyAttr_HTTP_EQUIV); if(!charsetAttr && !httpEquivAttr) continue; - // Meta charset comes in quite a few flavors: // 1. - expected for (X)HTML5. if (charsetAttr && !httpEquivAttr) { - // we already found one + // we already found one, so remove the rest. if(charsetFound) { + Node *prevNode = node->prev; + TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); TY_(DiscardElement)( doc, node ); - printf("WARNING ABOUT DISCARDING ELEMENT \n"); + node = prevNode; continue; } charsetFound = yes; tmbstr lCharset = TY_(tmbstrtolower)(charsetAttr->value); - if(strcmp(lCharset, enc) == 0) + // Fix mismatched attribute value + if(strcmp(lCharset, enc) != 0) { - // Move it to head - TY_(RemoveNode)( node ); - TY_(InsertNodeAtStart)( head, node ); + tmbstr newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); + TY_(tmbstrcpy)( newValue, enc ); + charsetAttr->value = newValue; + TY_(ReportError)( doc, head, node, BAD_ATTRIBUTE_VALUE_REPLACED ); } - else - { - printf("WARN ABOUT MISMATCH: %s not match output %s \n", lCharset, enc); + // Make sure it's the first element. + if ( node != head->next ){ TY_(RemoveNode)( node ); TY_(InsertNodeAtStart)( head, node ); } From b1629c4a4f5f6eb1462048aa74e793e045b92ed1 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Wed, 5 Oct 2016 20:22:19 +1100 Subject: [PATCH 05/19] fix(lexer): bad attribute reporting --- src/lexer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lexer.c b/src/lexer.c index 882522cd6..91e053516 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1717,8 +1717,8 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { tmbstr newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); TY_(tmbstrcpy)( newValue, enc ); + TY_(ReportAttrError)( doc, node, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); charsetAttr->value = newValue; - TY_(ReportError)( doc, head, node, BAD_ATTRIBUTE_VALUE_REPLACED ); } // Make sure it's the first element. if ( node != head->next ){ From 53ee94ddbaabb5e222eac68dddc6492fbcf7a6e3 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Thu, 6 Oct 2016 19:07:44 +1100 Subject: [PATCH 06/19] fix: incorrect check for first element in head --- src/lexer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lexer.c b/src/lexer.c index 91e053516..24dced8b7 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1721,7 +1721,7 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) charsetAttr->value = newValue; } // Make sure it's the first element. - if ( node != head->next ){ + if ( node != head->content->next ){ TY_(RemoveNode)( node ); TY_(InsertNodeAtStart)( head, node ); } From 932cc104a676a8d0179830e7bf68f1e53a1d3ff3 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Thu, 6 Oct 2016 19:29:56 +1100 Subject: [PATCH 07/19] feat(attrask.c): learn about charset attr --- src/attrask.c | 4 ++++ src/attrget.c | 4 ++++ src/attrs.h | 2 ++ 3 files changed, 10 insertions(+) diff --git a/src/attrask.c b/src/attrask.c index 92c75d32f..052823363 100644 --- a/src/attrask.c +++ b/src/attrask.c @@ -193,6 +193,10 @@ Bool TIDY_CALL tidyAttrIsROWSPAN( TidyAttr tattr ) { return attrIsROWSPAN( tidyAttrToImpl(tattr) ); } +Bool TIDY_CALL tidyAttrIsCHARSET( TidyAttr tattr ) +{ + return attrIsCHARSET( tidyAttrToImpl(tattr) ); +} /* * local variables: diff --git a/src/attrget.c b/src/attrget.c index 6562cc2b2..428620fd7 100644 --- a/src/attrget.c +++ b/src/attrget.c @@ -197,6 +197,10 @@ TidyAttr TIDY_CALL tidyAttrGetROWSPAN( TidyNode tnod ) { return tidyImplToAttr( attrGetROWSPAN( tidyNodeToImpl(tnod) ) ); } +TidyAttr TIDY_CALL tidyAttrGetCHARSET( TidyNode tnod ) +{ + return tidyImplToAttr( attrGetCHARSET( tidyNodeToImpl(tnod) ) ); +} /* * local variables: diff --git a/src/attrs.h b/src/attrs.h index e5b0fa975..0192efcbd 100644 --- a/src/attrs.h +++ b/src/attrs.h @@ -184,6 +184,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc); #define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN ) #define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING ) #define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING ) +#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET ) #define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR ) #define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF ) #define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET ) @@ -385,6 +386,7 @@ Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc); #define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT ) #define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR ) #define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED ) +#define attrGetCHARSET( nod ) TY_(AttrGetById)( nod, TidyAttr_CHARSET ) #define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED ) #define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG ) #define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET ) From 523d58b00448453040b4ad13d0b864f437b30da1 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Thu, 6 Oct 2016 19:30:23 +1100 Subject: [PATCH 08/19] refactor: ask for charset and http_equiv attrs --- src/lexer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 24dced8b7..0a48e5371 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1693,8 +1693,8 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { if (!nodeIsMETA(node)) continue; - AttVal *charsetAttr = TY_(AttrGetById)(node, TidyAttr_CHARSET); - AttVal *httpEquivAttr = TY_(AttrGetById)(node, TidyAttr_HTTP_EQUIV); + AttVal *charsetAttr = attrGetCHARSET(node); + AttVal *httpEquivAttr = attrGetHTTP_EQUIV(node); if(!charsetAttr && !httpEquivAttr) continue; // Meta charset comes in quite a few flavors: From aff76bec380ccb22701f772e9fba3f68bb414fb5 Mon Sep 17 00:00:00 2001 From: Marcos Caceres Date: Mon, 17 Oct 2016 17:00:58 +1100 Subject: [PATCH 09/19] fix(lexer.c): fixes from initial review --- src/lexer.c | 102 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 0a48e5371..7f3d683f2 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1682,72 +1682,86 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) */ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { - Node *head = TY_(FindHEAD)( doc ); - ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + AttVal *charsetAttr; + AttVal *contentAttr; + AttVal *httpEquivAttr; Bool charsetFound = no; - // We can't do anything we don't have a head or encoding is NULL + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + Node *currentNode; + Node *head = TY_(FindHEAD)( doc ); + Node *metaTag; + Node *prevNode; + TidyBuffer buf; + TidyBuffer charsetString; + tmbstr httpEquivAttrValue; + tmbstr lcontent; + tmbstr newValue; + /* We can't do anything we don't have a head or encoding is NULL */ if( !head || !enc ) return no; - - for (Node *node = head->content; node; node = node->next) + tidyBufInit(&charsetString); + for (currentNode = head->content; currentNode; currentNode = currentNode->next) { - if (!nodeIsMETA(node)) + if (!nodeIsMETA(currentNode)) continue; - AttVal *charsetAttr = attrGetCHARSET(node); - AttVal *httpEquivAttr = attrGetHTTP_EQUIV(node); + charsetAttr = attrGetCHARSET(currentNode); + httpEquivAttr = attrGetHTTP_EQUIV(currentNode); if(!charsetAttr && !httpEquivAttr) continue; - // Meta charset comes in quite a few flavors: - // 1. - expected for (X)HTML5. + /* + Meta charset comes in quite a few flavors: + 1. - expected for (X)HTML5. + */ if (charsetAttr && !httpEquivAttr) { // we already found one, so remove the rest. if(charsetFound) { - Node *prevNode = node->prev; - TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED); - TY_(DiscardElement)( doc, node ); - node = prevNode; + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)( doc, currentNode ); + currentNode = prevNode; continue; } charsetFound = yes; - tmbstr lCharset = TY_(tmbstrtolower)(charsetAttr->value); // Fix mismatched attribute value - if(strcmp(lCharset, enc) != 0) + if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) { - tmbstr newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); + newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); TY_(tmbstrcpy)( newValue, enc ); - TY_(ReportAttrError)( doc, node, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); charsetAttr->value = newValue; } // Make sure it's the first element. - if ( node != head->content->next ){ - TY_(RemoveNode)( node ); - TY_(InsertNodeAtStart)( head, node ); + if ( currentNode != head->content->next ){ + TY_(RemoveNode)( currentNode ); + TY_(InsertNodeAtStart)( head, currentNode ); } continue; } - - // 2. - // expected for HTML4. This is normally ok - but can clash. + /* + 2. + expected for HTML4. This is normally ok - but can clash. + */ if(httpEquivAttr && !charsetAttr) { - AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); - tmbstr lvalue = TY_(tmbstrtolower)(httpEquivAttr->value); - if(!contentAttr || strcmp(lvalue, "content-type") != 0) + tidyBufClear(&charsetString); + tidyBufAppend(&charsetString, "charset=", 8); + tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc )); + contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); + httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); + + if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) continue; - tmbstr lcontent = TY_(tmbstrtolower)(contentAttr->value); - char* charsetString = "charset="; - char* expected = calloc(strlen(enc) + strlen(charsetString) + 1, sizeof(char*)); - strcat(expected, charsetString); - strcat(expected, enc); - if(TY_(tmbsubstr)(lcontent, expected)){ + lcontent = TY_(tmbstrtolower)(contentAttr->value); + if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){ printf("WARN ABOUT CLASH: %s \n", contentAttr->value); } - free(expected); } - // 3. - // This is generally bad. + /* + 3. + This is generally bad. + */ if(httpEquivAttr && charsetAttr) { printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); @@ -1756,20 +1770,22 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if(charsetFound){ return yes; } - Node *node = TY_(InferredTag)(doc, TidyTag_META); + metaTag = TY_(InferredTag)(doc, TidyTag_META); switch(TY_(HTMLVersion)(doc)) { case HT50: case XH50: - TY_(AddAttribute)( doc, node, "charset", enc); + TY_(AddAttribute)( doc, metaTag, "charset", enc); break; default: - TY_(AddAttribute)( doc, node, "http-equiv", "content-type"); - TY_(AddAttribute)( doc, node, "content", "text/html; charset="); - AttVal *contentAttr = TY_(AttrGetById)(node, TidyAttr_CONTENT); - TY_(tmbstrcat)(contentAttr->value, enc); + tidyBufInit(&buf); + tidyBufAppend(&buf, "text/html; charset=", 19); + tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc)); + TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp); + tidyBufFree(&buf); } - TY_(InsertNodeAtStart)( head, node ); + TY_(InsertNodeAtStart)( head, metaTag ); + tidyBufFree(&charsetString); return yes; } From 6ebd12be67101df684ce468474619a5ef15a728b Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sun, 14 May 2017 19:08:29 +0200 Subject: [PATCH 10/19] Issue #456 - More work on this option --- src/clean.c | 3 + src/clean.h | 4 +- src/lexer.c | 156 +++++++++++++++++++++++++++++++++++++------------- src/tidylib.c | 10 +--- 4 files changed, 124 insertions(+), 49 deletions(-) diff --git a/src/clean.c b/src/clean.c index 779ddecbb..8db77c7b3 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) } #endif +/* Issue #456 - This is discarded */ +#if 0 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) { Node *pNode; @@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) pLastProp = NULL; } } +#endif void TY_(DropComments)(TidyDocImpl* doc, Node* node) { diff --git a/src/clean.h b/src/clean.h index 00d4923ec..d5d4117be 100644 --- a/src/clean.h +++ b/src/clean.h @@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html ); #if 0 void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent ); #endif - +/* Issue #456 - This is discarded */ +#if 0 void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent ); +#endif void TY_(DropComments)(TidyDocImpl* doc, Node* node); void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode); diff --git a/src/lexer.c b/src/lexer.c index c2773dc06..b3832d965 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) return node; } -/* Check meta charset - 1. if there is no meta charset, it adds one. - 2. if there is a meta charset, it moves it to the top if HEAD. - 3. if it doesn't match the output encoding, warn about that. - 4. if there are duplicates, discard them. - */ +/*\ + * Issue #456 - Check meta charset + * 1. if there is no meta charset, it adds one, according to doctype, no warning. + * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required? + * 3. if it doesn't match the output encoding, and fix. Naybe no warning? + * 4. if there are duplicates, discard them, with warning. +\*/ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { AttVal *charsetAttr; AttVal *contentAttr; AttVal *httpEquivAttr; Bool charsetFound = no; - ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); + uint outenc = cfg(doc, TidyOutCharEncoding); + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc); Node *currentNode; Node *head = TY_(FindHEAD)( doc ); Node *metaTag; @@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) tmbstr lcontent; tmbstr newValue; /* We can't do anything we don't have a head or encoding is NULL */ - if( !head || !enc ) + if( !head || !enc || !TY_(tmbstrlen)(enc)) + return no; + if (outenc == RAW) return no; +#ifndef NO_NATIVE_ISO2022_SUPPORT + if (outenc == ISO2022) + return no; +#endif + tidyBufInit(&charsetString); + /* Set up the content test 'charset=value' */ + tidyBufClear(&charsetString); + tidyBufAppend(&charsetString, "charset=", 8); + tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc)); + tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */ + /* process the children of the head */ for (currentNode = head->content; currentNode; currentNode = currentNode->next) { if (!nodeIsMETA(currentNode)) - continue; + continue; /* not a meta node */ charsetAttr = attrGetCHARSET(currentNode); httpEquivAttr = attrGetHTTP_EQUIV(currentNode); if(!charsetAttr && !httpEquivAttr) - continue; + continue; /* has no charset attribute */ /* Meta charset comes in quite a few flavors: - 1. - expected for (X)HTML5. + 1. - expected for (X)HTML5. */ if (charsetAttr && !httpEquivAttr) { - // we already found one, so remove the rest. - if(charsetFound) + /* we already found one, so remove the rest. */ + if(charsetFound || !charsetAttr->value) { prevNode = currentNode->prev; TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); @@ -1877,15 +1892,19 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) continue; } charsetFound = yes; - // Fix mismatched attribute value + /* Fix mismatched attribute value */ if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) { - newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) ); + newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */ TY_(tmbstrcpy)( newValue, enc ); - TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + /* Note: previously http-equiv had been modified, without warning + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + */ + TidyDocFree(doc, charsetAttr->value); /* free current value */ charsetAttr->value = newValue; } - // Make sure it's the first element. + /* Make sure it's the first element. */ if ( currentNode != head->content->next ){ TY_(RemoveNode)( currentNode ); TY_(InsertNodeAtStart)( head, currentNode ); @@ -1893,51 +1912,110 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) continue; } /* - 2. - expected for HTML4. This is normally ok - but can clash. + 2. + expected for HTML4. This is normally ok - but can clash. */ if(httpEquivAttr && !charsetAttr) { - tidyBufClear(&charsetString); - tidyBufAppend(&charsetString, "charset=", 8); - tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc )); contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); + if (!contentAttr) + continue; /* has no 'content' attribute */ + if (!httpEquivAttr->value) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); - - if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + continue; /* is not 'content-type' */ + if (!contentAttr->value) + { + prevNode = currentNode->prev; + /* maybe need better message here */ + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; continue; + } + /* check encoding matches + If a miss-match found here, fix it. previous silently done + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + */ lcontent = TY_(tmbstrtolower)(contentAttr->value); - if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){ - printf("WARN ABOUT CLASH: %s \n", contentAttr->value); + if (TY_(tmbsubstr)(lcontent, charsetString.bp)) + { + /* we already found one, so remove the rest. */ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + charsetFound = yes; + } + else + { + /* fix a mis-match */ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + } + else + { + /* correct the content */ + newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); + TidyDocFree(doc, contentAttr->value); + TY_(tmbstrcpy)(newValue, "text/html; charset="); + TY_(tmbstrcpy)(newValue + 19, enc); + contentAttr->value = newValue; + charsetFound = yes; + } } + continue; } /* - 3. - This is generally bad. + 3. + This is generally bad. Discard and warn. */ if(httpEquivAttr && charsetAttr) { - printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); + /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */ + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; } } - if(charsetFound){ - return yes; - } - metaTag = TY_(InferredTag)(doc, TidyTag_META); - switch(TY_(HTMLVersion)(doc)) + + /* completed head scan - add appropriate meta - if 'yes' and none exists */ + if (cfgBool(doc, TidyMetaCharset) && !charsetFound) { + /* add appropriate meta charset tag - no warning */ + metaTag = TY_(InferredTag)(doc, TidyTag_META); + switch (TY_(HTMLVersion)(doc)) + { case HT50: case XH50: - TY_(AddAttribute)( doc, metaTag, "charset", enc); + TY_(AddAttribute)(doc, metaTag, "charset", enc); break; default: tidyBufInit(&buf); - tidyBufAppend(&buf, "text/html; charset=", 19); - tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc)); - TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp); + tidyBufAppend(&buf, "text/html; ", 11); + tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp)); + tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */ + TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); tidyBufFree(&buf); + } + TY_(InsertNodeAtStart)(head, metaTag); } - TY_(InsertNodeAtStart)( head, metaTag ); tidyBufFree(&charsetString); return yes; } diff --git a/src/tidylib.c b/src/tidylib.c index 811721bef..e2c443c84 100755 --- a/src/tidylib.c +++ b/src/tidylib.c @@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut ); Bool xmlDecl = cfgBool( doc, TidyXmlDecl ); Bool tidyMark = cfgBool( doc, TidyMark ); - Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset); Bool tidyXmlTags = cfgBool( doc, TidyXmlTags ); Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName ); Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis ); @@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) #endif /* Reconcile http-equiv meta element with output encoding */ - if (cfg( doc, TidyOutCharEncoding) != RAW -#ifndef NO_NATIVE_ISO2022_SUPPORT - && cfg( doc, TidyOutCharEncoding) != ISO2022 -#endif - ) - TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc )); + TY_(TidyMetaCharset)(doc); if ( !TY_(CheckNodeIntegrity)( &doc->root ) ) TidyPanic( doc->allocator, integrity ); @@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc ) if (tidyMark ) TY_(AddGenerator)(doc); - if (tidyMetaCharset) - TY_(TidyMetaCharset)(doc); } /* ensure presence of initial */ From f310f1d5de8ba7ce8adc1659b0e586a83ca8e47f Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Mon, 15 May 2017 16:39:53 +0200 Subject: [PATCH 11/19] Issue #456 - Move new TidyMetaCharset to clean --- src/clean.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/clean.h | 1 + src/lexer.c | 193 -------------------------------------------------- 3 files changed, 197 insertions(+), 194 deletions(-) diff --git a/src/clean.c b/src/clean.c index 8db77c7b3..0abf53aff 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) } #endif -/* Issue #456 - This is discarded */ +/* Issue #456 - This is discarded + See replacement TidyMetaCharset */ #if 0 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) { @@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) } #endif +/*\ +* Issue #456 - Check meta charset +* 1. if there is no meta charset, it adds one, according to doctype, no warning. +* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required? +* 3. if it doesn't match the output encoding, and fix. Naybe no warning? +* 4. if there are duplicates, discard them, with warning. +\*/ +Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) +{ + AttVal *charsetAttr; + AttVal *contentAttr; + AttVal *httpEquivAttr; + Bool charsetFound = no; + uint outenc = cfg(doc, TidyOutCharEncoding); + ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc); + Node *currentNode; + Node *head = TY_(FindHEAD)(doc); + Node *metaTag; + Node *prevNode; + TidyBuffer buf; + TidyBuffer charsetString; + tmbstr httpEquivAttrValue; + tmbstr lcontent; + tmbstr newValue; + /* We can't do anything we don't have a head or encoding is NULL */ + if (!head || !enc || !TY_(tmbstrlen)(enc)) + return no; + if (outenc == RAW) + return no; +#ifndef NO_NATIVE_ISO2022_SUPPORT + if (outenc == ISO2022) + return no; +#endif + + tidyBufInit(&charsetString); + /* Set up the content test 'charset=value' */ + tidyBufClear(&charsetString); + tidyBufAppend(&charsetString, "charset=", 8); + tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc)); + tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */ + /* process the children of the head */ + for (currentNode = head->content; currentNode; currentNode = currentNode->next) + { + if (!nodeIsMETA(currentNode)) + continue; /* not a meta node */ + charsetAttr = attrGetCHARSET(currentNode); + httpEquivAttr = attrGetHTTP_EQUIV(currentNode); + if (!charsetAttr && !httpEquivAttr) + continue; /* has no charset attribute */ + /* + Meta charset comes in quite a few flavors: + 1. - expected for (X)HTML5. + */ + if (charsetAttr && !httpEquivAttr) + { + /* we already found one, so remove the rest. */ + if (charsetFound || !charsetAttr->value) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + charsetFound = yes; + /* Fix mismatched attribute value */ + if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) + { + newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */ + TY_(tmbstrcpy)(newValue, enc); + /* Note: previously http-equiv had been modified, without warning + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); + */ + TidyDocFree(doc, charsetAttr->value); /* free current value */ + charsetAttr->value = newValue; + } + /* Make sure it's the first element. */ + if (currentNode != head->content->next) { + TY_(RemoveNode)(currentNode); + TY_(InsertNodeAtStart)(head, currentNode); + } + continue; + } + /* + 2. + expected for HTML4. This is normally ok - but can clash. + */ + if (httpEquivAttr && !charsetAttr) + { + contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); + if (!contentAttr) + continue; /* has no 'content' attribute */ + if (!httpEquivAttr->value) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); + if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + continue; /* is not 'content-type' */ + if (!contentAttr->value) + { + prevNode = currentNode->prev; + /* maybe need better message here */ + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + /* check encoding matches + If a miss-match found here, fix it. previous silently done + in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) + */ + lcontent = TY_(tmbstrtolower)(contentAttr->value); + if (TY_(tmbsubstr)(lcontent, charsetString.bp)) + { + /* we already found one, so remove the rest. */ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + continue; + } + charsetFound = yes; + } + else + { + /* fix a mis-match */ + if (charsetFound) + { + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + } + else + { + /* correct the content */ + newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); + TidyDocFree(doc, contentAttr->value); + TY_(tmbstrcpy)(newValue, "text/html; charset="); + TY_(tmbstrcpy)(newValue + 19, enc); + contentAttr->value = newValue; + charsetFound = yes; + } + } + continue; + } + /* + 3. + This is generally bad. Discard and warn. + */ + if (httpEquivAttr && charsetAttr) + { + /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */ + prevNode = currentNode->prev; + TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); + TY_(DiscardElement)(doc, currentNode); + currentNode = prevNode; + } + } + + /* completed head scan - add appropriate meta - if 'yes' and none exists */ + if (cfgBool(doc, TidyMetaCharset) && !charsetFound) + { + /* add appropriate meta charset tag - no warning */ + metaTag = TY_(InferredTag)(doc, TidyTag_META); + switch (TY_(HTMLVersion)(doc)) + { + case HT50: + case XH50: + TY_(AddAttribute)(doc, metaTag, "charset", enc); + break; + default: + tidyBufInit(&buf); + tidyBufAppend(&buf, "text/html; ", 11); + tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp)); + tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */ + TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); + tidyBufFree(&buf); + } + TY_(InsertNodeAtStart)(head, metaTag); + } + tidyBufFree(&charsetString); + return yes; +} + + void TY_(DropComments)(TidyDocImpl* doc, Node* node) { Node* next; diff --git a/src/clean.h b/src/clean.h index d5d4117be..e538bcf7c 100644 --- a/src/clean.h +++ b/src/clean.h @@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent ); #if 0 void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent ); #endif +Bool TY_(TidyMetaCharset)(TidyDocImpl* doc); void TY_(DropComments)(TidyDocImpl* doc, Node* node); void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode); diff --git a/src/lexer.c b/src/lexer.c index b3832d965..238fbfa13 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1827,199 +1827,6 @@ Node *TY_(FindBody)( TidyDocImpl* doc ) return node; } -/*\ - * Issue #456 - Check meta charset - * 1. if there is no meta charset, it adds one, according to doctype, no warning. - * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required? - * 3. if it doesn't match the output encoding, and fix. Naybe no warning? - * 4. if there are duplicates, discard them, with warning. -\*/ -Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) -{ - AttVal *charsetAttr; - AttVal *contentAttr; - AttVal *httpEquivAttr; - Bool charsetFound = no; - uint outenc = cfg(doc, TidyOutCharEncoding); - ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc); - Node *currentNode; - Node *head = TY_(FindHEAD)( doc ); - Node *metaTag; - Node *prevNode; - TidyBuffer buf; - TidyBuffer charsetString; - tmbstr httpEquivAttrValue; - tmbstr lcontent; - tmbstr newValue; - /* We can't do anything we don't have a head or encoding is NULL */ - if( !head || !enc || !TY_(tmbstrlen)(enc)) - return no; - if (outenc == RAW) - return no; -#ifndef NO_NATIVE_ISO2022_SUPPORT - if (outenc == ISO2022) - return no; -#endif - - tidyBufInit(&charsetString); - /* Set up the content test 'charset=value' */ - tidyBufClear(&charsetString); - tidyBufAppend(&charsetString, "charset=", 8); - tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc)); - tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */ - /* process the children of the head */ - for (currentNode = head->content; currentNode; currentNode = currentNode->next) - { - if (!nodeIsMETA(currentNode)) - continue; /* not a meta node */ - charsetAttr = attrGetCHARSET(currentNode); - httpEquivAttr = attrGetHTTP_EQUIV(currentNode); - if(!charsetAttr && !httpEquivAttr) - continue; /* has no charset attribute */ - /* - Meta charset comes in quite a few flavors: - 1. - expected for (X)HTML5. - */ - if (charsetAttr && !httpEquivAttr) - { - /* we already found one, so remove the rest. */ - if(charsetFound || !charsetAttr->value) - { - prevNode = currentNode->prev; - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)( doc, currentNode ); - currentNode = prevNode; - continue; - } - charsetFound = yes; - /* Fix mismatched attribute value */ - if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) - { - newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */ - TY_(tmbstrcpy)( newValue, enc ); - /* Note: previously http-equiv had been modified, without warning - in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) - TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); - */ - TidyDocFree(doc, charsetAttr->value); /* free current value */ - charsetAttr->value = newValue; - } - /* Make sure it's the first element. */ - if ( currentNode != head->content->next ){ - TY_(RemoveNode)( currentNode ); - TY_(InsertNodeAtStart)( head, currentNode ); - } - continue; - } - /* - 2. - expected for HTML4. This is normally ok - but can clash. - */ - if(httpEquivAttr && !charsetAttr) - { - contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT); - if (!contentAttr) - continue; /* has no 'content' attribute */ - if (!httpEquivAttr->value) - { - prevNode = currentNode->prev; - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)(doc, currentNode); - currentNode = prevNode; - continue; - } - httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); - if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) - continue; /* is not 'content-type' */ - if (!contentAttr->value) - { - prevNode = currentNode->prev; - /* maybe need better message here */ - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)(doc, currentNode); - currentNode = prevNode; - continue; - } - /* check encoding matches - If a miss-match found here, fix it. previous silently done - in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) - */ - lcontent = TY_(tmbstrtolower)(contentAttr->value); - if (TY_(tmbsubstr)(lcontent, charsetString.bp)) - { - /* we already found one, so remove the rest. */ - if (charsetFound) - { - prevNode = currentNode->prev; - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)(doc, currentNode); - currentNode = prevNode; - continue; - } - charsetFound = yes; - } - else - { - /* fix a mis-match */ - if (charsetFound) - { - prevNode = currentNode->prev; - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)(doc, currentNode); - currentNode = prevNode; - } - else - { - /* correct the content */ - newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); - TidyDocFree(doc, contentAttr->value); - TY_(tmbstrcpy)(newValue, "text/html; charset="); - TY_(tmbstrcpy)(newValue + 19, enc); - contentAttr->value = newValue; - charsetFound = yes; - } - } - continue; - } - /* - 3. - This is generally bad. Discard and warn. - */ - if(httpEquivAttr && charsetAttr) - { - /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */ - prevNode = currentNode->prev; - TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); - TY_(DiscardElement)(doc, currentNode); - currentNode = prevNode; - } - } - - /* completed head scan - add appropriate meta - if 'yes' and none exists */ - if (cfgBool(doc, TidyMetaCharset) && !charsetFound) - { - /* add appropriate meta charset tag - no warning */ - metaTag = TY_(InferredTag)(doc, TidyTag_META); - switch (TY_(HTMLVersion)(doc)) - { - case HT50: - case XH50: - TY_(AddAttribute)(doc, metaTag, "charset", enc); - break; - default: - tidyBufInit(&buf); - tidyBufAppend(&buf, "text/html; ", 11); - tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp)); - tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */ - TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); - tidyBufFree(&buf); - } - TY_(InsertNodeAtStart)(head, metaTag); - } - tidyBufFree(&charsetString); - return yes; -} - /* add meta element for Tidy */ Bool TY_(AddGenerator)( TidyDocImpl* doc ) { From a7a4cd6a1607ae06645f88734229f4d78811c36a Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Mon, 15 May 2017 16:42:49 +0200 Subject: [PATCH 12/19] Issue #456 - avoid head work if showing body only --- src/clean.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/clean.c b/src/clean.c index 0abf53aff..b4e9a38c8 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2321,6 +2321,8 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if (outenc == ISO2022) return no; #endif + if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState) + return no; /* nothing to do here if showing body only */ tidyBufInit(&charsetString); /* Set up the content test 'charset=value' */ From 21f008501abc8e386c37726ef0893cdbfd5f6007 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Mon, 15 May 2017 16:51:34 +0200 Subject: [PATCH 13/19] Issue #456 - Oops, also out of 'lexer.h' --- src/lexer.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/lexer.h b/src/lexer.h index a30e5d811..1d3d9cd7b 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -491,9 +491,6 @@ Node* TY_(FindXmlDecl)(TidyDocImpl* doc); /* Returns containing block element, if any */ Node* TY_(FindContainer)( Node* node ); -/* Adds meta element and sets the charset */ -Bool TY_(TidyMetaCharset)( TidyDocImpl* doc ); - /* add meta element for Tidy */ Bool TY_(AddGenerator)( TidyDocImpl* doc ); From 8a932f96eb1f243427cfa1b31f4529fd25f67a00 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sat, 27 May 2017 18:52:49 +0200 Subject: [PATCH 14/19] Issue #456 - Oops, incorrect merge conflict --- src/config.c | 68 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/config.c b/src/config.c index 3aa3d3805..8645ec10f 100644 --- a/src/config.c +++ b/src/config.c @@ -279,40 +279,40 @@ static const TidyOptionImpl option_defs[] = #if SUPPORT_ASIAN_ENCODINGS { TidyPunctWrap, PP, "punctuation-wrap", BL, no, ParsePickList, &boolPicks }, #endif - { TidyQuiet, MS, "quiet", BL, no, ParseBool, boolPicks }, - { TidyQuoteAmpersand, MU, "quote-ampersand", BL, yes, ParseBool, boolPicks }, - { TidyQuoteMarks, MU, "quote-marks", BL, no, ParseBool, boolPicks }, - { TidyQuoteNbsp, MU, "quote-nbsp", BL, yes, ParseBool, boolPicks }, - { TidyReplaceColor, MU, "replace-color", BL, no, ParseBool, boolPicks }, - { TidyShowErrors, DG, "show-errors", IN, 6, ParseInt, NULL }, - { TidyShowInfo, DG, "show-info", BL, yes, ParseBool, boolPicks }, - { TidyShowMarkup, PP, "markup", BL, yes, ParseBool, boolPicks }, - { TidyShowWarnings, DG, "show-warnings", BL, yes, ParseBool, boolPicks }, - { TidySkipNested, MU, "skip-nested", BL, yes, ParseBool, boolPicks }, /* 1642186 - Issue #65 */ - { TidySortAttributes, PP, "sort-attributes", IN, TidySortAttrNone,ParseSorter, sorterPicks }, - { TidyStrictTagsAttr, MU, "strict-tags-attributes", BL, no, ParseBool, boolPicks }, /* 20160209 - Issue #350 */ - { TidyTabSize, PP, "tab-size", IN, 8, ParseInt, NULL }, - { TidyUpperCaseAttrs, MU, "uppercase-attributes", BL, no, ParseBool, boolPicks }, - { TidyUpperCaseTags, MU, "uppercase-tags", BL, no, ParseBool, boolPicks }, - { TidyUseCustomTags, MU, "custom-tags", IN, TidyCustomNo, ParseUseCustomTags,customTagsPicks }, /* 20170309 - Issue #119 */ - { TidyVertSpace, PP, "vertical-space", IN, no, ParseAutoBool, autoBoolPicks }, /* #228 - tri option */ - { TidyWarnPropAttrs, MU, "warn-proprietary-attributes", BL, yes, ParseBool, boolPicks }, - { TidyWord2000, MU, "word-2000", BL, no, ParseBool, boolPicks }, - { TidyWrapAsp, PP, "wrap-asp", BL, yes, ParseBool, boolPicks }, - { TidyWrapAttVals, PP, "wrap-attributes", BL, no, ParseBool, boolPicks }, - { TidyWrapJste, PP, "wrap-jste", BL, yes, ParseBool, boolPicks }, - { TidyWrapLen, PP, "wrap", IN, 68, ParseInt, NULL }, - { TidyWrapPhp, PP, "wrap-php", BL, yes, ParseBool, boolPicks }, - { TidyWrapScriptlets, PP, "wrap-script-literals", BL, no, ParseBool, boolPicks }, - { TidyWrapSection, PP, "wrap-sections", BL, yes, ParseBool, boolPicks }, - { TidyWriteBack, MS, "write-back", BL, no, ParseBool, boolPicks }, - { TidyXhtmlOut, MU, "output-xhtml", BL, no, ParseBool, boolPicks }, - { TidyXmlDecl, MU, "add-xml-decl", BL, no, ParseBool, boolPicks }, - { TidyXmlOut, MU, "output-xml", BL, no, ParseBool, boolPicks }, - { TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParseBool, boolPicks }, - { TidyXmlSpace, MU, "add-xml-space", BL, no, ParseBool, boolPicks }, - { TidyXmlTags, MU, "input-xml", BL, no, ParseBool, boolPicks }, - { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParseBool, boolPicks }, /* 20161004 - Issue #456 */ + { TidyQuiet, MS, "quiet", BL, no, ParsePickList, &boolPicks }, + { TidyQuoteAmpersand, MU, "quote-ampersand", BL, yes, ParsePickList, &boolPicks }, + { TidyQuoteMarks, MU, "quote-marks", BL, no, ParsePickList, &boolPicks }, + { TidyQuoteNbsp, MU, "quote-nbsp", BL, yes, ParsePickList, &boolPicks }, + { TidyReplaceColor, MU, "replace-color", BL, no, ParsePickList, &boolPicks }, + { TidyShowErrors, DG, "show-errors", IN, 6, ParseInt, NULL }, + { TidyShowInfo, DG, "show-info", BL, yes, ParsePickList, &boolPicks }, + { TidyShowMarkup, PP, "markup", BL, yes, ParsePickList, &boolPicks }, + { TidyShowWarnings, DG, "show-warnings", BL, yes, ParsePickList, &boolPicks }, + { TidySkipNested, MU, "skip-nested", BL, yes, ParsePickList, &boolPicks }, /* 1642186 - Issue #65 */ + { TidySortAttributes, PP, "sort-attributes", IN, TidySortAttrNone,ParsePickList, &sorterPicks }, + { TidyStrictTagsAttr, MU, "strict-tags-attributes", BL, no, ParsePickList, &boolPicks }, /* 20160209 - Issue #350 */ + { TidyTabSize, PP, "tab-size", IN, 8, ParseInt, NULL }, + { TidyUpperCaseAttrs, MU, "uppercase-attributes", IN, TidyUppercaseNo, ParsePickList, &attributeCasePicks }, + { TidyUpperCaseTags, MU, "uppercase-tags", BL, no, ParsePickList, &boolPicks }, + { TidyUseCustomTags, MU, "custom-tags", IN, TidyCustomNo, ParsePickList, &customTagsPicks }, /* 20170309 - Issue #119 */ + { TidyVertSpace, PP, "vertical-space", IN, no, ParsePickList, &autoBoolPicks }, /* #228 - tri option */ + { TidyWarnPropAttrs, MU, "warn-proprietary-attributes", BL, yes, ParsePickList, &boolPicks }, + { TidyWord2000, MU, "word-2000", BL, no, ParsePickList, &boolPicks }, + { TidyWrapAsp, PP, "wrap-asp", BL, yes, ParsePickList, &boolPicks }, + { TidyWrapAttVals, PP, "wrap-attributes", BL, no, ParsePickList, &boolPicks }, + { TidyWrapJste, PP, "wrap-jste", BL, yes, ParsePickList, &boolPicks }, + { TidyWrapLen, PP, "wrap", IN, 68, ParseInt, NULL }, + { TidyWrapPhp, PP, "wrap-php", BL, yes, ParsePickList, &boolPicks }, + { TidyWrapScriptlets, PP, "wrap-script-literals", BL, no, ParsePickList, &boolPicks }, + { TidyWrapSection, PP, "wrap-sections", BL, yes, ParsePickList, &boolPicks }, + { TidyWriteBack, MS, "write-back", BL, no, ParsePickList, &boolPicks }, + { TidyXhtmlOut, MU, "output-xhtml", BL, no, ParsePickList, &boolPicks }, + { TidyXmlDecl, MU, "add-xml-decl", BL, no, ParsePickList, &boolPicks }, + { TidyXmlOut, MU, "output-xml", BL, no, ParsePickList, &boolPicks }, + { TidyXmlPIs, MU, "assume-xml-procins", BL, no, ParsePickList, &boolPicks }, + { TidyXmlSpace, MU, "add-xml-space", BL, no, ParsePickList, &boolPicks }, + { TidyXmlTags, MU, "input-xml", BL, no, ParsePickList, &boolPicks }, + { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParsePickList, &boolPicks }, /* 20161004 - Issue #456 */ { N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL } }; From 40e1d64963ab082385e4fd3c0d685753999a7577 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sat, 27 May 2017 20:13:51 +0200 Subject: [PATCH 15/19] Issue #456 - A desparate commit to get this WIP right, but... --- src/clean.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/clean.c b/src/clean.c index b4e9a38c8..8ed14793a 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2210,7 +2210,6 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) /* Issue #456 - This is discarded See replacement TidyMetaCharset */ -#if 0 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) { Node *pNode; @@ -2286,7 +2285,6 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) pLastProp = NULL; } } -#endif /*\ * Issue #456 - Check meta charset @@ -2309,9 +2307,11 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) Node *prevNode; TidyBuffer buf; TidyBuffer charsetString; - tmbstr httpEquivAttrValue; - tmbstr lcontent; + /* tmbstr httpEquivAttrValue; */ + /* tmbstr lcontent; */ tmbstr newValue; + Bool add_meta = cfgBool(doc, TidyMetaCharset); + /* We can't do anything we don't have a head or encoding is NULL */ if (!head || !enc || !TY_(tmbstrlen)(enc)) return no; @@ -2324,6 +2324,11 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState) return no; /* nothing to do here if showing body only */ + if (!add_meta) { + TY_(VerifyHTTPEquiv)(doc, head); + return no; + } + tidyBufInit(&charsetString); /* Set up the content test 'charset=value' */ tidyBufClear(&charsetString); @@ -2356,7 +2361,7 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) } charsetFound = yes; /* Fix mismatched attribute value */ - if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0) + if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0) { newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */ TY_(tmbstrcpy)(newValue, enc); @@ -2391,24 +2396,30 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) currentNode = prevNode; continue; } - httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); - if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) + /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */ + if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0) continue; /* is not 'content-type' */ if (!contentAttr->value) { + /* While this **seems** like a good idea, current tidy accepts this + see reg.test case-1117013.html which contains + so for now. This could be reviewed + in future, since there seem no need to keep this invalid meta */ +#if 0 /* 0000000000000000000000000000000000000000000000000 */ prevNode = currentNode->prev; /* maybe need better message here */ TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED); TY_(DiscardElement)(doc, currentNode); currentNode = prevNode; - continue; +#endif /* 000000000000000000000000000000000000000000000000 */ + continue; /* has no 'content' attribute has NO VALUE! */ } /* check encoding matches If a miss-match found here, fix it. previous silently done in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) - */ lcontent = TY_(tmbstrtolower)(contentAttr->value); - if (TY_(tmbsubstr)(lcontent, charsetString.bp)) + */ + if (TY_(tmbstrcasecmp)(contentAttr->value, charsetString.bp)) { /* we already found one, so remove the rest. */ if (charsetFound) From 13b34c9d8b4c2e9322034d8fc8b5fc028d04017b Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sun, 4 Jun 2017 15:41:16 +0200 Subject: [PATCH 16/19] Issue #456 - BAH! Fix a stupid logic reversal --- src/clean.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/clean.c b/src/clean.c index 8ed14793a..a4eb9fb64 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2210,6 +2210,7 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent ) /* Issue #456 - This is discarded See replacement TidyMetaCharset */ +#if 0 /* 000000000000000000000000 */ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) { Node *pNode; @@ -2285,6 +2286,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) pLastProp = NULL; } } +#endif /* 000000000000000000000000 */ /*\ * Issue #456 - Check meta charset @@ -2324,10 +2326,12 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState) return no; /* nothing to do here if showing body only */ +#if 0 /* 000000000000000000000000 */ if (!add_meta) { TY_(VerifyHTTPEquiv)(doc, head); return no; } +#endif /* 000000000000000000000000 */ tidyBufInit(&charsetString); /* Set up the content test 'charset=value' */ @@ -2419,7 +2423,7 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) lcontent = TY_(tmbstrtolower)(contentAttr->value); */ - if (TY_(tmbstrcasecmp)(contentAttr->value, charsetString.bp)) + if (TY_(tmbstrcasecmp)(contentAttr->value, charsetString.bp) == 0) { /* we already found one, so remove the rest. */ if (charsetFound) @@ -2470,7 +2474,7 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) } /* completed head scan - add appropriate meta - if 'yes' and none exists */ - if (cfgBool(doc, TidyMetaCharset) && !charsetFound) + if (add_meta && !charsetFound) { /* add appropriate meta charset tag - no warning */ metaTag = TY_(InferredTag)(doc, TidyTag_META); From a4770daa2ba73c7a7fd50acc95d5cfb63992f4c9 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sun, 4 Jun 2017 20:34:22 +0200 Subject: [PATCH 17/19] Issue #456 - Add 'Info:' message, when meta added. It also fixes the addition of the constant 'http-equiv="Content-Type" attribute. --- include/tidyenum.h | 3 ++- src/clean.c | 4 +++- src/language_en.h | 4 +++- src/message.c | 5 +++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/tidyenum.h b/include/tidyenum.h index 95b6e604b..bc76cf388 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -270,7 +270,8 @@ extern "C" { FN(COERCE_TO_ENDTAG) \ FN(ELEMENT_NOT_EMPTY) \ FN(UNEXPECTED_END_OF_FILE) \ - FN(UNEXPECTED_ENDTAG) + FN(UNEXPECTED_ENDTAG) \ + FN(ADDED_MISSING_CHARSET) /** These are report messages added by Tidy's accessibility module. */ diff --git a/src/clean.c b/src/clean.c index a4eb9fb64..76b6acf44 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2489,10 +2489,12 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) tidyBufAppend(&buf, "text/html; ", 11); tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp)); tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */ - TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); + TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */ + TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp); /* add 'content=""' */ tidyBufFree(&buf); } TY_(InsertNodeAtStart)(head, metaTag); + TY_(ReportError)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */ } tidyBufFree(&charsetString); return yes; diff --git a/src/language_en.h b/src/language_en.h index f9618c845..b971db151 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -1900,7 +1900,9 @@ static languageDefinition language_en = { whichPluralForm_en, { { ELEMENT_NOT_EMPTY, 0, "%s element not empty or not closed" }, /* ReportError, ReportAttrError */ { UNEXPECTED_END_OF_FILE, 0, "unexpected end of file %s" }, /* ReportError, ReportAttrError */ { UNEXPECTED_ENDTAG, 0, "unexpected " }, /* ReportError, ReportFatal */ - + + { ADDED_MISSING_CHARSET, 0, "Added appropriate missing to %s" }, /* ReportInfo */ + #if SUPPORT_ACCESSIBILITY_CHECKS /*************************************** diff --git a/src/message.c b/src/message.c index 9b86bb558..5330e3e60 100755 --- a/src/message.c +++ b/src/message.c @@ -408,6 +408,11 @@ void TY_(ReportError)(TidyDocImpl* doc, Node *element, Node *node, uint code) case REMOVED_HTML5: message = TY_(tidyMessageCreateWithNode)(doc, rpt, code, TidyError, nodedesc ); break; + + case ADDED_MISSING_CHARSET: + message = TY_(tidyMessageCreateWithNode)(doc, rpt, code, TidyInfo, nodedesc); + break; + } messageOut( message ); From 97292646f6b008f6008b7a8b93e104e1e9f88198 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Mon, 5 Jun 2017 17:16:53 +0200 Subject: [PATCH 18/19] Issue #456 - Add 'Info:' message when charset replaced --- include/tidyenum.h | 1 + src/clean.c | 5 +++-- src/language_en.h | 1 + src/message.c | 4 ++++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/tidyenum.h b/include/tidyenum.h index bc76cf388..fecce0990 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -169,6 +169,7 @@ extern "C" { FN(BACKSLASH_IN_URI) \ FN(BAD_ATTRIBUTE_VALUE_REPLACED) \ FN(BAD_ATTRIBUTE_VALUE) \ + FN(ATTRIBUTE_VALUE_REPLACED) \ FN(ESCAPED_ILLEGAL_URI) \ FN(FIXED_BACKSLASH) \ FN(ID_NAME_MISMATCH) \ diff --git a/src/clean.c b/src/clean.c index 76b6acf44..819767dc2 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2371,8 +2371,8 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) TY_(tmbstrcpy)(newValue, enc); /* Note: previously http-equiv had been modified, without warning in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head) - TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED ); */ + TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED); TidyDocFree(doc, charsetAttr->value); /* free current value */ charsetAttr->value = newValue; } @@ -2450,9 +2450,10 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) { /* correct the content */ newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); - TidyDocFree(doc, contentAttr->value); TY_(tmbstrcpy)(newValue, "text/html; charset="); TY_(tmbstrcpy)(newValue + 19, enc); + TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED); + TidyDocFree(doc, contentAttr->value); contentAttr->value = newValue; charsetFound = yes; } diff --git a/src/language_en.h b/src/language_en.h index b971db151..a84b6ed4e 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -1798,6 +1798,7 @@ static languageDefinition language_en = { whichPluralForm_en, { { BACKSLASH_IN_URI, 0, "%s URI reference contains backslash. Typo?" }, /* ReportAttrError */ { BAD_ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\" had invalid value \"%s\" and has been replaced" }, /* ReportAttrError */ { BAD_ATTRIBUTE_VALUE, 0, "%s attribute \"%s\" has invalid value \"%s\"" }, /* ReportAttrError */ + { ATTRIBUTE_VALUE_REPLACED, 0, "%s attribute \"%s\", incorrect value \"%s\" replaced" }, /* ReportAttrError/TidyInfo */ { ESCAPED_ILLEGAL_URI, 0, "%s escaping malformed URI reference" }, /* ReportAttrError */ { FIXED_BACKSLASH, 0, "%s converting backslash in URI to slash" }, /* ReportAttrError */ { ID_NAME_MISMATCH, 0, "%s id and name attribute value mismatch" }, /* ReportAttrError */ diff --git a/src/message.c b/src/message.c index 5330e3e60..3ab40e764 100755 --- a/src/message.c +++ b/src/message.c @@ -526,6 +526,10 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code) message = TY_(tidyMessageCreateWithNode)(doc, node, code, TidyWarning, tagdesc, name, value ); break; + case ATTRIBUTE_VALUE_REPLACED: + message = TY_(tidyMessageCreateWithNode)(doc, node, code, TidyInfo, tagdesc, name, value); + break; + case UNEXPECTED_QUOTEMARK: case MISSING_QUOTEMARK: case ID_NAME_MISMATCH: From b32e14a8eab1e14bb0e02ee804abe30fc94759d9 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Fri, 9 Jun 2017 03:11:39 +0200 Subject: [PATCH 19/19] Issue #456 - add new option `show-meta-change` --- include/tidyenum.h | 1 + src/clean.c | 3 ++- src/config.c | 1 + src/language_en.h | 7 ++++++- 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/tidyenum.h b/include/tidyenum.h index fecce0990..f7799d0d4 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -651,6 +651,7 @@ typedef enum TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */ TidyXmlTags, /**< Treat input as XML */ TidyMetaCharset, /**< Adds/checks/fixes meta charset in the head, based on document type */ + TidyShowMetaChange, /**< show when meta http-equiv content charset was changed - compatibility */ N_TIDY_OPTIONS /**< Must be last */ } TidyOptionId; diff --git a/src/clean.c b/src/clean.c index 819767dc2..f3f06016c 100644 --- a/src/clean.c +++ b/src/clean.c @@ -2452,7 +2452,8 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc) newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1); TY_(tmbstrcpy)(newValue, "text/html; charset="); TY_(tmbstrcpy)(newValue + 19, enc); - TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED); + if (cfgBool(doc, TidyShowMetaChange)) /* Issue #456 - backward compatibility only */ + TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED); TidyDocFree(doc, contentAttr->value); contentAttr->value = newValue; charsetFound = yes; diff --git a/src/config.c b/src/config.c index 8645ec10f..a09ecce61 100644 --- a/src/config.c +++ b/src/config.c @@ -313,6 +313,7 @@ static const TidyOptionImpl option_defs[] = { TidyXmlSpace, MU, "add-xml-space", BL, no, ParsePickList, &boolPicks }, { TidyXmlTags, MU, "input-xml", BL, no, ParsePickList, &boolPicks }, { TidyMetaCharset, MS, "add-meta-charset", BL, no, ParsePickList, &boolPicks }, /* 20161004 - Issue #456 */ + { TidyShowMetaChange, MS, "show-meta-change", BL, no, ParsePickList, &boolPicks }, /* 20170609 - Issue #456 */ { N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL } }; diff --git a/src/language_en.h b/src/language_en.h index a84b6ed4e..cd3fae179 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -1522,7 +1522,12 @@ static languageDefinition language_en = { whichPluralForm_en, { be translated. */ TidyMetaCharset, 0, "This option adds a meta element and sets the charset attribute to the encoding of the document." - "Set this option to 'yes' if you want this." + " Set this option to yes if you want this." + }, + { + TidyShowMetaChange, 0, + "This option shows when a meta http-equiv content charset attribute was changed to the encoding of the document." + " Set this option to yes if you want this." }, /********************************************