From f94c157f9cb1ddbce5bd36b2697c7f91a9c8c0b0 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Sun, 23 Feb 2014 16:40:04 +1100
Subject: [PATCH 1/2] syntax: record multibyte chars' positions absolutely, not
 relative to file.

Previously multibyte UTF-8 chars were being recorded as byte offsets
from the start of the file, and then later compared against global byte
positions, resulting in the compiler possibly thinking it had a byte
position pointing inside a multibyte character, if there were multibyte
characters in any non-crate files. (Although, sometimes the byte offsets
line up just right to not ICE, but that was a coincidence.)

Fixes #11136.
Fixes #11178.
---
 src/libsyntax/parse/lexer.rs                  |  3 +-
 src/test/run-make/unicode-input/Makefile      |  6 +++
 .../run-make/unicode-input/multiple_files.rs  | 54 +++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 src/test/run-make/unicode-input/Makefile
 create mode 100644 src/test/run-make/unicode-input/multiple_files.rs

diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs
index b711e95bc943b..5bace75a5eace 100644
--- a/src/libsyntax/parse/lexer.rs
+++ b/src/libsyntax/parse/lexer.rs
@@ -264,8 +264,7 @@ pub fn bump(rdr: &StringReader) {
         }
 
         if byte_offset_diff > 1 {
-            rdr.filemap.record_multibyte_char(
-                Pos::from_uint(current_byte_offset), byte_offset_diff);
+            rdr.filemap.record_multibyte_char(rdr.last_pos.get(), byte_offset_diff);
         }
     } else {
         rdr.curr.set(None);
diff --git a/src/test/run-make/unicode-input/Makefile b/src/test/run-make/unicode-input/Makefile
new file mode 100644
index 0000000000000..1e420bddb777b
--- /dev/null
+++ b/src/test/run-make/unicode-input/Makefile
@@ -0,0 +1,6 @@
+-include ../tools.mk
+
+all:
+	# check that we don't ICE on unicode input, issue #11178
+	$(RUSTC) multiple_files.rs
+	$(call RUN,multiple_files)  "$(RUSTC)" "$(TMPDIR)"
diff --git a/src/test/run-make/unicode-input/multiple_files.rs b/src/test/run-make/unicode-input/multiple_files.rs
new file mode 100644
index 0000000000000..2758ac12bab1b
--- /dev/null
+++ b/src/test/run-make/unicode-input/multiple_files.rs
@@ -0,0 +1,54 @@
+use std::{char, os, run, str};
+use std::rand::{task_rng, Rng};
+use std::io::File;
+
+// creates unicode_input_multiple_files_{main,chars}.rs, where the
+// former imports the latter. `_chars` just contains an identifier
+// made up of random characters, because the compiler will emit an
+// error message about the ident being in the wrong place, with a span
+// (and creating this span used to upset the compiler).
+
+fn random_char() -> char {
+    let mut rng = task_rng();
+    // a subset of the XID_start unicode table (ensuring that the
+    // compiler doesn't fail with an "unrecognised token" error)
+    let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
+        1 => (0x41, 0x5a),
+        2 => (0xf8, 0x1ba),
+        3 => (0x1401, 0x166c),
+        _ => (0x10400, 0x1044f)
+    };
+
+    char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
+}
+
+fn main() {
+    let args = os::args();
+    let rustc = args[1].as_slice();
+    let tmpdir = Path::new(args[2].as_slice());
+
+    let main_file = tmpdir.join("unicode_input_multiple_files_main.rs");
+    let main_file_str = main_file.as_str().unwrap();
+    {
+        let _ = File::create(&main_file).unwrap()
+            .write_str("mod unicode_input_multiple_files_chars;");
+    }
+
+    for _ in range(0, 100) {
+        {
+            let mut w = File::create(&tmpdir.join("unicode_input_multiple_files_chars.rs")).unwrap();
+            for _ in range(0, 30) {
+                let _ = w.write_char(random_char());
+            }
+        }
+
+        // rustc is passed to us with --out-dir and -L etc., so we
+        // can't exec it directly
+        let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
+        let err = str::from_utf8_lossy(result.error);
+
+        // positive test so that this test will be updated when the
+        // compiler changes.
+        assert!(err.as_slice().contains("expected item but found"))
+    }
+}

From 96ddb469ceaff5128e2b35183dbad1e181b7a82f Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Sun, 23 Feb 2014 17:08:46 +1100
Subject: [PATCH 2/2] syntax: calculate positions of multibyte characters more
 correctly.

They are still not completely correct, since it does not handle
graphemes at all, just codepoints, but at least it handles the common
case correctly.

The calculation was previously very wrong (rather than just a little bit
wrong): it wasn't accounting for the fact that every character is at
least 1 byte, and so multibyte characters were pretending to be zero
width.

cc #8706
---
 src/libsyntax/codemap.rs                      |  7 ++-
 src/test/run-make/unicode-input/Makefile      |  5 ++
 .../run-make/unicode-input/multiple_files.rs  | 13 +++-
 .../run-make/unicode-input/span_length.rs     | 62 +++++++++++++++++++
 4 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 src/test/run-make/unicode-input/span_length.rs

diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs
index 486a25fa775c9..3e235796af424 100644
--- a/src/libsyntax/codemap.rs
+++ b/src/libsyntax/codemap.rs
@@ -460,11 +460,12 @@ impl CodeMap {
         for mbc in multibyte_chars.get().iter() {
             debug!("codemap: {:?}-byte char at {:?}", mbc.bytes, mbc.pos);
             if mbc.pos < bpos {
-                total_extra_bytes += mbc.bytes;
+                // every character is at least one byte, so we only
+                // count the actual extra bytes.
+                total_extra_bytes += mbc.bytes - 1;
                 // We should never see a byte position in the middle of a
                 // character
-                assert!(bpos == mbc.pos ||
-                        bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
+                assert!(bpos.to_uint() >= mbc.pos.to_uint() + mbc.bytes);
             } else {
                 break;
             }
diff --git a/src/test/run-make/unicode-input/Makefile b/src/test/run-make/unicode-input/Makefile
index 1e420bddb777b..2d6ecd3c55efc 100644
--- a/src/test/run-make/unicode-input/Makefile
+++ b/src/test/run-make/unicode-input/Makefile
@@ -4,3 +4,8 @@ all:
 	# check that we don't ICE on unicode input, issue #11178
 	$(RUSTC) multiple_files.rs
 	$(call RUN,multiple_files)  "$(RUSTC)" "$(TMPDIR)"
+
+	# check that our multibyte-ident spans are (approximately) the
+	# correct length. issue #8706
+	$(RUSTC) span_length.rs
+	$(call RUN,span_length) "$(RUSTC)" "$(TMPDIR)"
diff --git a/src/test/run-make/unicode-input/multiple_files.rs b/src/test/run-make/unicode-input/multiple_files.rs
index 2758ac12bab1b..68bec1d215a27 100644
--- a/src/test/run-make/unicode-input/multiple_files.rs
+++ b/src/test/run-make/unicode-input/multiple_files.rs
@@ -1,3 +1,13 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
 use std::{char, os, run, str};
 use std::rand::{task_rng, Rng};
 use std::io::File;
@@ -36,7 +46,8 @@ fn main() {
 
     for _ in range(0, 100) {
         {
-            let mut w = File::create(&tmpdir.join("unicode_input_multiple_files_chars.rs")).unwrap();
+            let randoms = tmpdir.join("unicode_input_multiple_files_chars.rs");
+            let mut w = File::create(&randoms).unwrap();
             for _ in range(0, 30) {
                 let _ = w.write_char(random_char());
             }
diff --git a/src/test/run-make/unicode-input/span_length.rs b/src/test/run-make/unicode-input/span_length.rs
new file mode 100644
index 0000000000000..c437b70baf3fc
--- /dev/null
+++ b/src/test/run-make/unicode-input/span_length.rs
@@ -0,0 +1,62 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::{char, os, run, str};
+use std::rand::{task_rng, Rng};
+use std::io::File;
+
+// creates a file with `fn main() { <random ident> }` and checks the
+// compiler emits a span of the appropriate length (for the
+// "unresolved name" message); currently just using the number of code
+// points, but should be the number of graphemes (FIXME #7043)
+
+fn random_char() -> char {
+    let mut rng = task_rng();
+    // a subset of the XID_start unicode table (ensuring that the
+    // compiler doesn't fail with an "unrecognised token" error)
+    let (lo, hi): (u32, u32) = match rng.gen_range(1, 4 + 1) {
+        1 => (0x41, 0x5a),
+        2 => (0xf8, 0x1ba),
+        3 => (0x1401, 0x166c),
+        _ => (0x10400, 0x1044f)
+    };
+
+    char::from_u32(rng.gen_range(lo, hi + 1)).unwrap()
+}
+
+fn main() {
+    let args = os::args();
+    let rustc = args[1].as_slice();
+    let tmpdir = Path::new(args[2].as_slice());
+
+    let main_file = tmpdir.join("span_main.rs");
+    let main_file_str = main_file.as_str().unwrap();
+
+    for _ in range(0, 100) {
+        let n = task_rng().gen_range(3u, 20);
+
+        {
+            let _ = write!(&mut File::create(&main_file).unwrap(),
+                           r"\#[feature(non_ascii_idents)]; fn main() \{ {} \}",
+                           // random string of length n
+                           range(0, n).map(|_| random_char()).collect::<~str>());
+        }
+
+        // rustc is passed to us with --out-dir and -L etc., so we
+        // can't exec it directly
+        let result = run::process_output("sh", [~"-c", rustc + " " + main_file_str]).unwrap();
+
+        let err = str::from_utf8_lossy(result.error);
+
+        // the span should end the line (e.g. no extra ~'s)
+        let expected_span = "^" + "~".repeat(n - 1) + "\n";
+        assert!(err.as_slice().contains(expected_span));
+    }
+}