BUG: read_hdf converting literal 'nan' string in Index to NaN (GH-9604) (#65603)

This commit is contained in:
jbrockmendel
2026-05-29 09:32:57 -07:00
committed by GitHub
parent f5dedc407b
commit e0eec6e07c
3 changed files with 64 additions and 6 deletions

View File

@@ -351,6 +351,7 @@ I/O
- Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
- :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`)
- Fixed ``MemoryError`` in :meth:`HDFStore.select` when iterating large tables with ``chunksize`` and no ``where`` filter (:issue:`15937`)
- Fixed bug in :func:`read_hdf` where the literal string ``"nan"`` in a string :class:`Index` was incorrectly converted to ``NaN`` on read, even when a custom ``nan_rep`` was supplied (:issue:`9604`)
- Fixed bug in :meth:`DataFrame.to_hdf` with ``format="table"`` where a :class:`TimedeltaIndex` was reconstructed as a :class:`PeriodIndex` (when ``freq`` was set) or an integer :class:`Index` (otherwise) on read-back (:issue:`21466`)
- Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
- Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`)

View File

@@ -2927,8 +2927,13 @@ class DataCol(IndexCol):
# convert nans / decode
if kind == "string":
# Old files may have been written without nan_rep persisted; the
# writer (write_data) defaulted None to "nan", so do the same here.
converted = _unconvert_string_array(
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
converted,
nan_rep=nan_rep if nan_rep is not None else "nan",
encoding=encoding,
errors=errors,
)
return self.values, converted
@@ -5569,7 +5574,10 @@ def _unconvert_string_array(
Parameters
----------
data : np.ndarray[fixed-length-string]
nan_rep : the storage repr of NaN
nan_rep : the storage repr of NaN, or None to skip substitution.
Pass None when the writer did not encode NaN as a sentinel string
(e.g. for string indices); otherwise legitimate occurrences of the
sentinel value would be incorrectly replaced with NaN on read.
encoding : str
errors : str
Handler for encoding errors.
@@ -5595,10 +5603,8 @@ def _unconvert_string_array(
else:
data = data.astype(dtype, copy=False).astype(object, copy=False)
if nan_rep is None:
nan_rep = "nan"
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
if nan_rep is not None:
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
return data.reshape(shape)

View File

@@ -61,6 +61,57 @@ def test_long_strings(temp_hdfstore):
tm.assert_frame_equal(df, result)
def test_string_nan_in_index_fixed(temp_h5_path):
    # GH#9604 — fixed format: a string Index containing the literal label
    # "nan" must round-trip unchanged. The reader used to run a NaN-sentinel
    # substitution over the index even though the writer never applies one
    # to indices, turning the label into NaN.
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    expected.to_hdf(temp_h5_path, key="s", mode="w")
    roundtripped = read_hdf(temp_h5_path, "s")

    tm.assert_series_equal(roundtripped, expected)
def test_string_nan_in_index_table(temp_hdfstore):
    # GH#9604 — table-format counterpart of the fixed-format index test.
    # Appending with a custom nan_rep additionally checks that nan_rep is
    # respected when the index is read back (it used to be silently ignored
    # for the index).
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    temp_hdfstore.append("s", expected, nan_rep="_nan_")
    roundtripped = temp_hdfstore.select("s")

    tm.assert_series_equal(roundtripped, expected)
def test_string_nan_in_dataframe_index(temp_h5_path):
    # GH#9604 — a DataFrame whose named string index contains the literal
    # label "nan" must round-trip through both the fixed and table formats.
    index = Index(["nan", "kai", "institute", "of"], name="ix")
    expected = DataFrame({"a": [1, 2, 3, 4]}, index=index)

    # The first write creates the file with the default format; the second
    # appends a table-format key to that same file.
    writes = [
        ("fixed", {"mode": "w"}),
        ("table", {"mode": "a", "format": "table"}),
    ]
    for key, to_hdf_kwargs in writes:
        expected.to_hdf(temp_h5_path, key=key, **to_hdf_kwargs)
        tm.assert_frame_equal(read_hdf(temp_h5_path, key), expected)
def test_string_column_literal_nan_and_real_nan(temp_hdfstore):
    # GH#9604 — data-column companion to the index tests: with a custom
    # nan_rep, a column holding both the literal string "nan" and a genuine
    # NaN must come back with each value intact, i.e. the symmetric nan_rep
    # substitution on data columns still works.
    values = ["x", "nan", np.nan, "y"]
    expected = DataFrame({"a": values}, index=["i1", "i2", "i3", "i4"])

    temp_hdfstore.append("df", expected, nan_rep="_NA_")
    roundtripped = temp_hdfstore.select("df")

    tm.assert_frame_equal(roundtripped, expected)
def test_api(temp_h5_path):
# GH4584
# API issue when to_hdf doesn't accept append AND format args