Skip to content

Commit a417f6b

Browse files
openandclose authored and domenic committed
Fix index computation after search() in <meta> extraction
Closes #6. Closes #7.
1 parent c5e8fe3 commit a417f6b

4 files changed

+78
-7
lines changed

lib/html-encoding-sniffer.js

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -239,12 +239,12 @@ function extractCharacterEncodingFromMeta(string) {
239239
let position = 0;
240240

241241
while (true) {
242-
let subPosition = string.substring(position).search(/charset/i);
242+
const indexOfCharset = string.substring(position).search(/charset/i);
243243

244-
if (subPosition === -1) {
244+
if (indexOfCharset === -1) {
245245
return null;
246246
}
247-
subPosition += "charset".length;
247+
let subPosition = position + indexOfCharset + "charset".length;
248248

249249
while (isSpaceCharacter(string[subPosition].charCodeAt(0))) {
250250
++subPosition;
@@ -280,10 +280,11 @@ function extractCharacterEncodingFromMeta(string) {
280280
return null;
281281
}
282282

283-
let end = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
284-
if (end === -1) {
285-
end = string.length;
286-
}
283+
const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
284+
const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
285+
string.length :
286+
position + indexOfASCIIWhitespaceOrSemicolon + 1;
287+
287288
return whatwgEncoding.labelToName(string.substring(position, end));
288289
}
289290

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta http-equiv=Content-Type content="text/html;charsetcharset=iso-8859-2">
5+
</head>
6+
<body></body>
7+
</html>
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta http-equiv=Content-Type content="text/html;charset=iso-8859-2 ">
5+
</head>
6+
<body></body>
7+
</html>

test/tests.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,62 @@ describe("A file with no BOM and a <meta charset> preceeded by a short comment <
233233
});
234234
});
235235

236+
describe("A file with no BOM and a <meta http-equiv> ending with a trailing space", () => {
237+
const buffer = read("no-bom-charset-http-equiv-trailing-space.html");
238+
239+
it("should sniff as the charset value, given no options", () => {
240+
const sniffedEncoding = htmlEncodingSniffer(buffer);
241+
242+
assert.strictEqual(sniffedEncoding, "ISO-8859-2");
243+
});
244+
245+
it("should sniff as the transport layer encoding, given that", () => {
246+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
247+
transportLayerEncodingLabel: "windows-1251",
248+
defaultEncoding: "ISO-8859-16"
249+
});
250+
251+
assert.strictEqual(sniffedEncoding, "windows-1251");
252+
});
253+
254+
255+
it("should sniff as the charset value, given only a default encoding", () => {
256+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
257+
defaultEncoding: "ISO-8859-16"
258+
});
259+
260+
assert.strictEqual(sniffedEncoding, "ISO-8859-2");
261+
});
262+
});
263+
264+
describe("A file with no BOM and a <meta http-equiv> with 'charsetcharset'", () => {
265+
const buffer = read("no-bom-charset-http-equiv-second-charset.html");
266+
267+
it("should sniff as the charset value, given no options", () => {
268+
const sniffedEncoding = htmlEncodingSniffer(buffer);
269+
270+
assert.strictEqual(sniffedEncoding, "ISO-8859-2");
271+
});
272+
273+
it("should sniff as the transport layer encoding, given that", () => {
274+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
275+
transportLayerEncodingLabel: "windows-1251",
276+
defaultEncoding: "ISO-8859-16"
277+
});
278+
279+
assert.strictEqual(sniffedEncoding, "windows-1251");
280+
});
281+
282+
283+
it("should sniff as the charset value, given only a default encoding", () => {
284+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
285+
defaultEncoding: "ISO-8859-16"
286+
});
287+
288+
assert.strictEqual(sniffedEncoding, "ISO-8859-2");
289+
});
290+
});
291+
236292
for (const utf16Encoding of ["utf-16be", "utf-16", "utf-16le"]) {
237293
describe(`A file with a BOM and a <meta charset> of ${utf16Encoding}`, () => {
238294
const buffer = read(`no-bom-charset-${utf16Encoding}.html`);

0 commit comments

Comments
 (0)