diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 61848cb127029..cb99f15775ab4 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -820,7 +820,7 @@ I/O
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
- Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
--
+- Bug in :class:`StataWriter` where value labels were always written with the default encoding (:issue:`46750`)
Period
^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index d9912f2480e07..b8d56172027e1 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -668,7 +668,9 @@ class StataValueLabel:
Encoding to use for value labels.
"""
- def __init__(self, catarray: Series, encoding: str = "latin-1") -> None:
+ def __init__(
+ self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
+ ) -> None:
if encoding not in ("latin-1", "utf-8"):
raise ValueError("Only latin-1 and utf-8 are supported.")
@@ -2250,7 +2252,7 @@ class StataWriter(StataParser):
"""
_max_string_length = 244
- _encoding = "latin-1"
+ _encoding: Literal["latin-1", "utf-8"] = "latin-1"
def __init__(
self,
@@ -2331,7 +2333,7 @@ def _prepare_non_cat_value_labels(
f"Can't create value labels for {labname}, value labels "
"can only be applied to numeric columns."
)
- svl = StataNonCatValueLabel(colname, labels)
+ svl = StataNonCatValueLabel(colname, labels, self._encoding)
non_cat_value_labels.append(svl)
return non_cat_value_labels
@@ -3575,7 +3577,7 @@ class StataWriterUTF8(StataWriter117):
>>> writer.write_file()
"""
- _encoding = "utf-8"
+ _encoding: Literal["utf-8"] = "utf-8"
def __init__(
self,
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index dafe1b4a3607d..377b8758c250e 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1797,6 +1797,7 @@ def test_utf8_writer(self, version):
"ᴐᴬᵀ": "",
}
data_label = "ᴅaᵀa-label"
+ value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}}
data["β"] = data["β"].astype(np.int32)
with tm.ensure_clean() as path:
writer = StataWriterUTF8(
@@ -1807,11 +1808,16 @@ def test_utf8_writer(self, version):
variable_labels=variable_labels,
write_index=False,
version=version,
+ value_labels=value_labels,
)
writer.write_file()
reread_encoded = read_stata(path)
# Missing is intentionally converted to empty strl
data["strls"] = data["strls"].fillna("")
+ # Variable with value labels is reread as categorical
+ data["β"] = (
+ data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
+ )
tm.assert_frame_equal(data, reread_encoded)
reader = StataReader(path)
assert reader.data_label == data_label