@@ -2208,7 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
2208
2208
}
2209
2209
#endif
2210
2210
2211
- /* Issue #456 - This is discarded */
2211
+ /* Issue #456 - This is discarded
2212
+ See replacement TidyMetaCharset */
2212
2213
#if 0
2213
2214
void TY_ (VerifyHTTPEquiv )(TidyDocImpl * doc , Node * head )
2214
2215
{
@@ -2287,6 +2288,200 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2287
2288
}
2288
2289
#endif
2289
2290
2291
+ /*\
2292
+ * Issue #456 - Check meta charset
2293
+ * 1. if there is no meta charset, it adds one, according to doctype, no warning.
2294
+ * 2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
2295
+ * 3. if it doesn't match the output encoding, and fix. Naybe no warning?
2296
+ * 4. if there are duplicates, discard them, with warning.
2297
+ \*/
2298
+ Bool TY_ (TidyMetaCharset )(TidyDocImpl * doc )
2299
+ {
2300
+ AttVal * charsetAttr ;
2301
+ AttVal * contentAttr ;
2302
+ AttVal * httpEquivAttr ;
2303
+ Bool charsetFound = no ;
2304
+ uint outenc = cfg (doc , TidyOutCharEncoding );
2305
+ ctmbstr enc = TY_ (GetEncodingNameFromTidyId )(outenc );
2306
+ Node * currentNode ;
2307
+ Node * head = TY_ (FindHEAD )(doc );
2308
+ Node * metaTag ;
2309
+ Node * prevNode ;
2310
+ TidyBuffer buf ;
2311
+ TidyBuffer charsetString ;
2312
+ tmbstr httpEquivAttrValue ;
2313
+ tmbstr lcontent ;
2314
+ tmbstr newValue ;
2315
+ /* We can't do anything we don't have a head or encoding is NULL */
2316
+ if (!head || !enc || !TY_ (tmbstrlen )(enc ))
2317
+ return no ;
2318
+ if (outenc == RAW )
2319
+ return no ;
2320
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
2321
+ if (outenc == ISO2022 )
2322
+ return no ;
2323
+ #endif
2324
+
2325
+ tidyBufInit (& charsetString );
2326
+ /* Set up the content test 'charset=value' */
2327
+ tidyBufClear (& charsetString );
2328
+ tidyBufAppend (& charsetString , "charset=" , 8 );
2329
+ tidyBufAppend (& charsetString , (char * )enc , TY_ (tmbstrlen )(enc ));
2330
+ tidyBufAppend (& charsetString , "\0" , 1 ); /* zero terminate the buffer */
2331
+ /* process the children of the head */
2332
+ for (currentNode = head -> content ; currentNode ; currentNode = currentNode -> next )
2333
+ {
2334
+ if (!nodeIsMETA (currentNode ))
2335
+ continue ; /* not a meta node */
2336
+ charsetAttr = attrGetCHARSET (currentNode );
2337
+ httpEquivAttr = attrGetHTTP_EQUIV (currentNode );
2338
+ if (!charsetAttr && !httpEquivAttr )
2339
+ continue ; /* has no charset attribute */
2340
+ /*
2341
+ Meta charset comes in quite a few flavors:
2342
+ 1. <meta charset="value"> - expected for (X)HTML5.
2343
+ */
2344
+ if (charsetAttr && !httpEquivAttr )
2345
+ {
2346
+ /* we already found one, so remove the rest. */
2347
+ if (charsetFound || !charsetAttr -> value )
2348
+ {
2349
+ prevNode = currentNode -> prev ;
2350
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2351
+ TY_ (DiscardElement )(doc , currentNode );
2352
+ currentNode = prevNode ;
2353
+ continue ;
2354
+ }
2355
+ charsetFound = yes ;
2356
+ /* Fix mismatched attribute value */
2357
+ if (TY_ (tmbstrcmp )(TY_ (tmbstrtolower )(charsetAttr -> value ), enc ) != 0 )
2358
+ {
2359
+ newValue = (tmbstr )TidyDocAlloc (doc , TY_ (tmbstrlen )(enc ) + 1 ); /* allocate + 1 for 0 */
2360
+ TY_ (tmbstrcpy )(newValue , enc );
2361
+ /* Note: previously http-equiv had been modified, without warning
2362
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2363
+ TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
2364
+ */
2365
+ TidyDocFree (doc , charsetAttr -> value ); /* free current value */
2366
+ charsetAttr -> value = newValue ;
2367
+ }
2368
+ /* Make sure it's the first element. */
2369
+ if (currentNode != head -> content -> next ) {
2370
+ TY_ (RemoveNode )(currentNode );
2371
+ TY_ (InsertNodeAtStart )(head , currentNode );
2372
+ }
2373
+ continue ;
2374
+ }
2375
+ /*
2376
+ 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
2377
+ expected for HTML4. This is normally ok - but can clash.
2378
+ */
2379
+ if (httpEquivAttr && !charsetAttr )
2380
+ {
2381
+ contentAttr = TY_ (AttrGetById )(currentNode , TidyAttr_CONTENT );
2382
+ if (!contentAttr )
2383
+ continue ; /* has no 'content' attribute */
2384
+ if (!httpEquivAttr -> value )
2385
+ {
2386
+ prevNode = currentNode -> prev ;
2387
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2388
+ TY_ (DiscardElement )(doc , currentNode );
2389
+ currentNode = prevNode ;
2390
+ continue ;
2391
+ }
2392
+ httpEquivAttrValue = TY_ (tmbstrtolower )(httpEquivAttr -> value );
2393
+ if (TY_ (tmbstrcmp )(httpEquivAttr -> value , (tmbstr ) "content-type" ) != 0 )
2394
+ continue ; /* is not 'content-type' */
2395
+ if (!contentAttr -> value )
2396
+ {
2397
+ prevNode = currentNode -> prev ;
2398
+ /* maybe need better message here */
2399
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2400
+ TY_ (DiscardElement )(doc , currentNode );
2401
+ currentNode = prevNode ;
2402
+ continue ;
2403
+ }
2404
+ /* check encoding matches
2405
+ If a miss-match found here, fix it. previous silently done
2406
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2407
+ */
2408
+ lcontent = TY_ (tmbstrtolower )(contentAttr -> value );
2409
+ if (TY_ (tmbsubstr )(lcontent , charsetString .bp ))
2410
+ {
2411
+ /* we already found one, so remove the rest. */
2412
+ if (charsetFound )
2413
+ {
2414
+ prevNode = currentNode -> prev ;
2415
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2416
+ TY_ (DiscardElement )(doc , currentNode );
2417
+ currentNode = prevNode ;
2418
+ continue ;
2419
+ }
2420
+ charsetFound = yes ;
2421
+ }
2422
+ else
2423
+ {
2424
+ /* fix a mis-match */
2425
+ if (charsetFound )
2426
+ {
2427
+ prevNode = currentNode -> prev ;
2428
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2429
+ TY_ (DiscardElement )(doc , currentNode );
2430
+ currentNode = prevNode ;
2431
+ }
2432
+ else
2433
+ {
2434
+ /* correct the content */
2435
+ newValue = (tmbstr )TidyDocAlloc (doc , 19 + TY_ (tmbstrlen )(enc ) + 1 );
2436
+ TidyDocFree (doc , contentAttr -> value );
2437
+ TY_ (tmbstrcpy )(newValue , "text/html; charset=" );
2438
+ TY_ (tmbstrcpy )(newValue + 19 , enc );
2439
+ contentAttr -> value = newValue ;
2440
+ charsetFound = yes ;
2441
+ }
2442
+ }
2443
+ continue ;
2444
+ }
2445
+ /*
2446
+ 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
2447
+ This is generally bad. Discard and warn.
2448
+ */
2449
+ if (httpEquivAttr && charsetAttr )
2450
+ {
2451
+ /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
2452
+ prevNode = currentNode -> prev ;
2453
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
2454
+ TY_ (DiscardElement )(doc , currentNode );
2455
+ currentNode = prevNode ;
2456
+ }
2457
+ }
2458
+
2459
+ /* completed head scan - add appropriate meta - if 'yes' and none exists */
2460
+ if (cfgBool (doc , TidyMetaCharset ) && !charsetFound )
2461
+ {
2462
+ /* add appropriate meta charset tag - no warning */
2463
+ metaTag = TY_ (InferredTag )(doc , TidyTag_META );
2464
+ switch (TY_ (HTMLVersion )(doc ))
2465
+ {
2466
+ case HT50 :
2467
+ case XH50 :
2468
+ TY_ (AddAttribute )(doc , metaTag , "charset" , enc );
2469
+ break ;
2470
+ default :
2471
+ tidyBufInit (& buf );
2472
+ tidyBufAppend (& buf , "text/html; " , 11 );
2473
+ tidyBufAppend (& buf , charsetString .bp , TY_ (tmbstrlen )(charsetString .bp ));
2474
+ tidyBufAppend (& buf , "\0" , 1 ); /* zero terminate the buffer */
2475
+ TY_ (AddAttribute )(doc , metaTag , "content" , (char * )buf .bp );
2476
+ tidyBufFree (& buf );
2477
+ }
2478
+ TY_ (InsertNodeAtStart )(head , metaTag );
2479
+ }
2480
+ tidyBufFree (& charsetString );
2481
+ return yes ;
2482
+ }
2483
+
2484
+
2290
2485
void TY_ (DropComments )(TidyDocImpl * doc , Node * node )
2291
2486
{
2292
2487
Node * next ;
0 commit comments