Skip to content

Commit f5d2e67

Browse files
committed
MAINT: Restrict use of iterator
Restrict iterator to StataReaders constructed with a positive chunksize
1 parent aef1622 commit f5d2e67

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

pandas/io/stata.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,10 @@ def __init__(
10211021
self._order_categoricals = order_categoricals
10221022
self._encoding = ""
10231023
self._chunksize = chunksize
1024+
if self._chunksize is not None and (
1025+
not isinstance(chunksize, int) or chunksize <= 0
1026+
):
1027+
raise ValueError("chunksize must be a positive integer when set.")
10241028

10251029
# State variables for the file
10261030
self._has_string_data = False
@@ -1486,6 +1490,10 @@ def _read_strls(self) -> None:
14861490
self.GSO[str(v_o)] = decoded_va
14871491

14881492
def __next__(self) -> DataFrame:
1493+
if self._chunksize is None:
1494+
raise ValueError(
1495+
"chunksize must be set to a positive integer to use as an iterator."
1496+
)
14891497
return self.read(nrows=self._chunksize or 1)
14901498

14911499
def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1769,7 +1777,7 @@ def _do_convert_categoricals(
17691777
vl = value_label_dict[label]
17701778
keys = np.array([k for k in vl.keys()])
17711779
column = data[col]
1772-
if column.isin(keys).all() and self._chunksize:
1780+
if self._chunksize is not None and column.isin(keys).all():
17731781
# If all categories are in the keys and we are iterating,
17741782
# use the same keys for all chunks. If some are missing
17751783
# value labels, then we will fall back to the categories

pandas/tests/io/test_stata.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1895,3 +1895,15 @@ def test_chunked_categorical_partial(dirpath):
18951895
large_chunk = reader.__next__()
18961896
direct = read_stata(dta_file)
18971897
tm.assert_frame_equal(direct, large_chunk)
1898+
1899+
1900+
def test_iterator_errors(dirpath):
1901+
dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
1902+
with pytest.raises(ValueError, match="chunksize must be a positive"):
1903+
StataReader(dta_file, chunksize=-1)
1904+
with pytest.raises(ValueError, match="chunksize must be a positive"):
1905+
StataReader(dta_file, chunksize=0)
1906+
with pytest.raises(ValueError, match="chunksize must be a positive"):
1907+
StataReader(dta_file, chunksize="apple")
1908+
with pytest.raises(ValueError, match="chunksize must be set to a positive"):
1909+
StataReader(dta_file).__next__()

0 commit comments

Comments
 (0)