Skip to content

Commit 00f1e3f

Browse files
committed
Patch #534304: Implement phase 1 of PEP 263.
1 parent a729daf commit 00f1e3f

File tree

13 files changed

+656
-31
lines changed

13 files changed

+656
-31
lines changed

Doc/ref/ref2.tex

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@ \chapter{Lexical analysis\label{lexical}}
77
\index{parser}
88
\index{token}
99

10-
Python uses the 7-bit \ASCII{} character set for program text and string
11-
literals. 8-bit characters may be used in string literals and comments
12-
but their interpretation is platform dependent; the proper way to
13-
insert 8-bit characters in string literals is by using octal or
14-
hexadecimal escape sequences.
10+
Python uses the 7-bit \ASCII{} character set for program text.
11+
\versionadded[An encoding declaration can be used to indicate that
12+
string literals and comments use an encoding different from \ASCII.]{2.3}
13+
For compatibility with older versions, Python only warns if it finds
14+
8-bit characters; those warnings should be corrected by either declaring
15+
an explicit encoding, or using escape sequences if those bytes are binary
16+
data, instead of characters.
17+
1518

1619
The run-time character set depends on the I/O devices connected to the
1720
program but is generally a superset of \ASCII.
@@ -69,6 +72,37 @@ \subsection{Comments\label{comments}}
6972
\index{hash character}
7073

7174

75+
\subsection{Encoding declarations\label{encodings}}
76+
77+
If a comment in the first or second line of the Python script matches
78+
the regular expression "coding[=:]\s*([-\w.]+)", this comment is
79+
processed as an encoding declaration; the first group of this
80+
expression names the encoding of the source code file. The recommended
81+
forms of this expression are
82+
83+
\begin{verbatim}
84+
# -*- coding: <encoding-name> -*-
85+
\end{verbatim}
86+
87+
which is recognized also by GNU Emacs, and
88+
89+
\begin{verbatim}
90+
# vim:fileencoding=<encoding-name>
91+
\end{verbatim}
92+
93+
which is recognized by Bram Moolenaar's VIM. In addition, if the first
94+
bytes of the file are the UTF-8 signature ($'\xef\xbb\xbf'$), the
95+
declared file encoding is UTF-8 (this is supported, among others, by
96+
Microsoft's notepad.exe).
97+
98+
If an encoding is declared, the encoding name must be recognized by
99+
Python. % XXX there should be a list of supported encodings.
100+
The encoding is used for all lexical analysis, in particular to find
101+
the end of a string, and to interpret the contents of Unicode literals.
102+
String literals are converted to Unicode for syntactical analysis,
103+
then converted back to their original encoding before interpretation
104+
starts.
105+
72106
\subsection{Explicit line joining\label{explicit-joining}}
73107

74108
Two or more physical lines may be joined into logical lines using

Grammar/Grammar

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
102102
list_if: 'if' test [list_iter]
103103

104104
testlist1: test (',' test)*
105+
106+
# not used in grammar, but may appear in "node" passed from Parser to Compiler
107+
encoding_decl: NAME

Include/errcode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ extern "C" {
2525
#define E_OVERFLOW 19 /* Node had too many children */
2626
#define E_TOODEEP 20 /* Too many indentation levels */
2727
#define E_DEDENT 21 /* No matching outer block for dedent */
28+
#define E_DECODE 22 /* Error in decoding into Unicode */
2829

2930
#ifdef __cplusplus
3031
}

Include/graminit.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,4 @@
6565
#define list_for 320
6666
#define list_if 321
6767
#define testlist1 322
68+
#define encoding_decl 323

Makefile.pre.in

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,15 +190,15 @@ POBJS= \
190190
Parser/node.o \
191191
Parser/parser.o \
192192
Parser/parsetok.o \
193-
Parser/tokenizer.o \
194193
Parser/bitset.o \
195194
Parser/metagrammar.o
196195

197-
PARSER_OBJS= $(POBJS) Parser/myreadline.o
196+
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o
198197

199198
PGOBJS= \
200199
Objects/obmalloc.o \
201200
Python/mysnprintf.o \
201+
Parser/tokenizer_pgen.o \
202202
Parser/firstsets.o \
203203
Parser/grammar.o \
204204
Parser/pgen.o \
@@ -434,6 +434,8 @@ Parser/grammar.o: $(srcdir)/Parser/grammar.c \
434434
$(srcdir)/Include/grammar.h
435435
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
436436

437+
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
438+
437439

438440
Python/compile.o Python/symtable.o: $(GRAMMAR_H)
439441

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ Type/class unification and new-style classes
66

77
Core and builtins
88

9+
- Encoding declarations (PEP 263, phase 1) have been implemented.
10+
911
- list.sort() has a new implementation. While cross-platform results
1012
may vary, and in data-dependent ways, this is much faster on many
1113
kinds of partially ordered lists than the previous implementation,

Parser/parsetok.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "parser.h"
99
#include "parsetok.h"
1010
#include "errcode.h"
11+
#include "graminit.h"
1112

1213
int Py_TabcheckFlag;
1314

@@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
4546
return NULL;
4647
}
4748

49+
tok->filename = filename ? filename : "<string>";
4850
if (Py_TabcheckFlag || Py_VerboseFlag) {
49-
tok->filename = filename ? filename : "<string>";
5051
tok->altwarning = (tok->filename != NULL);
5152
if (Py_TabcheckFlag >= 2)
5253
tok->alterror++;
@@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
7879
err_ret->error = E_NOMEM;
7980
return NULL;
8081
}
82+
tok->filename = filename;
8183
if (Py_TabcheckFlag || Py_VerboseFlag) {
82-
tok->filename = filename;
8384
tok->altwarning = (filename != NULL);
8485
if (Py_TabcheckFlag >= 2)
8586
tok->alterror++;
@@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
185186
err_ret->text[len] = '\0';
186187
}
187188
}
189+
} else if (tok->encoding != NULL) {
190+
node* r = PyNode_New(encoding_decl);
191+
r->n_str = tok->encoding;
192+
r->n_nchildren = 1;
193+
r->n_child = n;
194+
tok->encoding = NULL;
195+
n = r;
188196
}
189197

190198
PyTokenizer_Free(tok);

0 commit comments

Comments
 (0)