Skip to content

Commit f310f1d

Browse files
committed
Issue #456 - Move new TidyMetaCharset to clean
1 parent 6ebd12b commit f310f1d

File tree

3 files changed

+197
-194
lines changed

3 files changed

+197
-194
lines changed

src/clean.c

Lines changed: 196 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
22082208
}
22092209
#endif
22102210

2211-
/* Issue #456 - This is discarded */
2211+
/* Issue #456 - This is discarded
2212+
See replacement TidyMetaCharset */
22122213
#if 0
22132214
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
22142215
{
@@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
22872288
}
22882289
#endif
22892290

2291+
/*\
2292+
* Issue #456 - Check meta charset
2293+
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
2294+
* 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
2295+
* 3. if it doesn't match the output encoding, and fix. Naybe no warning?
2296+
* 4. if there are duplicates, discard them, with warning.
2297+
\*/
2298+
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
2299+
{
2300+
AttVal *charsetAttr;
2301+
AttVal *contentAttr;
2302+
AttVal *httpEquivAttr;
2303+
Bool charsetFound = no;
2304+
uint outenc = cfg(doc, TidyOutCharEncoding);
2305+
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
2306+
Node *currentNode;
2307+
Node *head = TY_(FindHEAD)(doc);
2308+
Node *metaTag;
2309+
Node *prevNode;
2310+
TidyBuffer buf;
2311+
TidyBuffer charsetString;
2312+
tmbstr httpEquivAttrValue;
2313+
tmbstr lcontent;
2314+
tmbstr newValue;
2315+
/* We can't do anything we don't have a head or encoding is NULL */
2316+
if (!head || !enc || !TY_(tmbstrlen)(enc))
2317+
return no;
2318+
if (outenc == RAW)
2319+
return no;
2320+
#ifndef NO_NATIVE_ISO2022_SUPPORT
2321+
if (outenc == ISO2022)
2322+
return no;
2323+
#endif
2324+
2325+
tidyBufInit(&charsetString);
2326+
/* Set up the content test 'charset=value' */
2327+
tidyBufClear(&charsetString);
2328+
tidyBufAppend(&charsetString, "charset=", 8);
2329+
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
2330+
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
2331+
/* process the children of the head */
2332+
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
2333+
{
2334+
if (!nodeIsMETA(currentNode))
2335+
continue; /* not a meta node */
2336+
charsetAttr = attrGetCHARSET(currentNode);
2337+
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
2338+
if (!charsetAttr && !httpEquivAttr)
2339+
continue; /* has no charset attribute */
2340+
/*
2341+
Meta charset comes in quite a few flavors:
2342+
1. <meta charset="value"> - expected for (X)HTML5.
2343+
*/
2344+
if (charsetAttr && !httpEquivAttr)
2345+
{
2346+
/* we already found one, so remove the rest. */
2347+
if (charsetFound || !charsetAttr->value)
2348+
{
2349+
prevNode = currentNode->prev;
2350+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2351+
TY_(DiscardElement)(doc, currentNode);
2352+
currentNode = prevNode;
2353+
continue;
2354+
}
2355+
charsetFound = yes;
2356+
/* Fix mismatched attribute value */
2357+
if (TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
2358+
{
2359+
newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1); /* allocate + 1 for 0 */
2360+
TY_(tmbstrcpy)(newValue, enc);
2361+
/* Note: previously http-equiv had been modified, without warning
2362+
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2363+
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
2364+
*/
2365+
TidyDocFree(doc, charsetAttr->value); /* free current value */
2366+
charsetAttr->value = newValue;
2367+
}
2368+
/* Make sure it's the first element. */
2369+
if (currentNode != head->content->next) {
2370+
TY_(RemoveNode)(currentNode);
2371+
TY_(InsertNodeAtStart)(head, currentNode);
2372+
}
2373+
continue;
2374+
}
2375+
/*
2376+
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
2377+
expected for HTML4. This is normally ok - but can clash.
2378+
*/
2379+
if (httpEquivAttr && !charsetAttr)
2380+
{
2381+
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
2382+
if (!contentAttr)
2383+
continue; /* has no 'content' attribute */
2384+
if (!httpEquivAttr->value)
2385+
{
2386+
prevNode = currentNode->prev;
2387+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2388+
TY_(DiscardElement)(doc, currentNode);
2389+
currentNode = prevNode;
2390+
continue;
2391+
}
2392+
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
2393+
if (TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
2394+
continue; /* is not 'content-type' */
2395+
if (!contentAttr->value)
2396+
{
2397+
prevNode = currentNode->prev;
2398+
/* maybe need better message here */
2399+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2400+
TY_(DiscardElement)(doc, currentNode);
2401+
currentNode = prevNode;
2402+
continue;
2403+
}
2404+
/* check encoding matches
2405+
If a miss-match found here, fix it. previous silently done
2406+
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2407+
*/
2408+
lcontent = TY_(tmbstrtolower)(contentAttr->value);
2409+
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
2410+
{
2411+
/* we already found one, so remove the rest. */
2412+
if (charsetFound)
2413+
{
2414+
prevNode = currentNode->prev;
2415+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2416+
TY_(DiscardElement)(doc, currentNode);
2417+
currentNode = prevNode;
2418+
continue;
2419+
}
2420+
charsetFound = yes;
2421+
}
2422+
else
2423+
{
2424+
/* fix a mis-match */
2425+
if (charsetFound)
2426+
{
2427+
prevNode = currentNode->prev;
2428+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2429+
TY_(DiscardElement)(doc, currentNode);
2430+
currentNode = prevNode;
2431+
}
2432+
else
2433+
{
2434+
/* correct the content */
2435+
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
2436+
TidyDocFree(doc, contentAttr->value);
2437+
TY_(tmbstrcpy)(newValue, "text/html; charset=");
2438+
TY_(tmbstrcpy)(newValue + 19, enc);
2439+
contentAttr->value = newValue;
2440+
charsetFound = yes;
2441+
}
2442+
}
2443+
continue;
2444+
}
2445+
/*
2446+
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
2447+
This is generally bad. Discard and warn.
2448+
*/
2449+
if (httpEquivAttr && charsetAttr)
2450+
{
2451+
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
2452+
prevNode = currentNode->prev;
2453+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2454+
TY_(DiscardElement)(doc, currentNode);
2455+
currentNode = prevNode;
2456+
}
2457+
}
2458+
2459+
/* completed head scan - add appropriate meta - if 'yes' and none exists */
2460+
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
2461+
{
2462+
/* add appropriate meta charset tag - no warning */
2463+
metaTag = TY_(InferredTag)(doc, TidyTag_META);
2464+
switch (TY_(HTMLVersion)(doc))
2465+
{
2466+
case HT50:
2467+
case XH50:
2468+
TY_(AddAttribute)(doc, metaTag, "charset", enc);
2469+
break;
2470+
default:
2471+
tidyBufInit(&buf);
2472+
tidyBufAppend(&buf, "text/html; ", 11);
2473+
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
2474+
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
2475+
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
2476+
tidyBufFree(&buf);
2477+
}
2478+
TY_(InsertNodeAtStart)(head, metaTag);
2479+
}
2480+
tidyBufFree(&charsetString);
2481+
return yes;
2482+
}
2483+
2484+
22902485
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
22912486
{
22922487
Node* next;

src/clean.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
6767
#if 0
6868
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
6969
#endif
70+
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc);
7071

7172
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
7273
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);

0 commit comments

Comments
 (0)