BUG: read_hdf converting literal 'nan' string in Index to NaN (GH-9604) (#65603)

2026-05-30 01:03:43 +08:00 · 2026-05-29 09:32:57 -07:00
parent f5dedc407b
commit e0eec6e07c
3 changed files with 64 additions and 6 deletions
--- a/doc/source/whatsnew/v3.1.0.rst
+++ b/doc/source/whatsnew/v3.1.0.rst
@@ -351,6 +351,7 @@ I/O
 - Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
 - :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`)
 - Fixed ``MemoryError`` in :meth:`HDFStore.select` when iterating large tables with ``chunksize`` and no ``where`` filter (:issue:`15937`)
+- Fixed bug in :func:`read_hdf` where the literal string ``"nan"`` in a string :class:`Index` was incorrectly converted to ``NaN`` on read, even when a custom ``nan_rep`` was supplied (:issue:`9604`)
 - Fixed bug in :meth:`DataFrame.to_hdf` with ``format="table"`` where a :class:`TimedeltaIndex` was reconstructed as a :class:`PeriodIndex` (when ``freq`` was set) or an integer :class:`Index` (otherwise) on read-back (:issue:`21466`)
 - Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
 - Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`)
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -2927,8 +2927,13 @@ class DataCol(IndexCol):

        # convert nans / decode
        if kind == "string":
+            # Old files may have been written without nan_rep persisted; the
+            # writer (write_data) defaulted None to "nan", so do the same here.
            converted = _unconvert_string_array(
-                converted, nan_rep=nan_rep, encoding=encoding, errors=errors
+                converted,
+                nan_rep=nan_rep if nan_rep is not None else "nan",
+                encoding=encoding,
+                errors=errors,
            )

        return self.values, converted
@@ -5569,7 +5574,10 @@ def _unconvert_string_array(
    Parameters
    ----------
    data : np.ndarray[fixed-length-string]
-    nan_rep : the storage repr of NaN
+    nan_rep : str or None
+        The storage repr of NaN, or None to skip substitution. Pass None when the
+        writer did not encode NaN as a sentinel string (e.g. for string indices);
+        otherwise legitimate occurrences of it would be replaced with NaN on read.
    encoding : str
    errors : str
        Handler for encoding errors.
@@ -5595,10 +5603,8 @@ def _unconvert_string_array(
        else:
            data = data.astype(dtype, copy=False).astype(object, copy=False)

-    if nan_rep is None:
-        nan_rep = "nan"
-
-    libwriters.string_array_replace_from_nan_rep(data, nan_rep)
+    if nan_rep is not None:
+        libwriters.string_array_replace_from_nan_rep(data, nan_rep)
    return data.reshape(shape)


--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -61,6 +61,57 @@ def test_long_strings(temp_hdfstore):
    tm.assert_frame_equal(df, result)


+def test_string_nan_in_index_fixed(temp_h5_path):
+    # GH#9604 — literal "nan" string in a string Index was being converted to
+    # NaN on read because the unconverter applied a NaN-sentinel substitution
+    # that the writer never performed for indices.
+    words = ["nan", "kai", "institute", "of", "technology"]
+    ser = Series(range(len(words)), index=words)
+
+    ser.to_hdf(temp_h5_path, key="s", mode="w")
+    result = read_hdf(temp_h5_path, "s")
+
+    tm.assert_series_equal(result, ser)
+
+
+def test_string_nan_in_index_table(temp_hdfstore):
+    # GH#9604 — same bug, table format. Also verifies that a custom nan_rep
+    # does not cause any NaN substitution to be applied to the index on read.
+    words = ["nan", "kai", "institute", "of", "technology"]
+    ser = Series(range(len(words)), index=words)
+
+    temp_hdfstore.append("s", ser, nan_rep="_nan_")
+    result = temp_hdfstore.select("s")
+
+    tm.assert_series_equal(result, ser)
+
+
+def test_string_nan_in_dataframe_index(temp_h5_path):
+    # GH#9604 — DataFrame index with literal "nan" strings, both formats.
+    df = DataFrame(
+        {"a": [1, 2, 3, 4]},
+        index=Index(["nan", "kai", "institute", "of"], name="ix"),
+    )
+
+    df.to_hdf(temp_h5_path, key="fixed", mode="w")
+    tm.assert_frame_equal(read_hdf(temp_h5_path, "fixed"), df)
+
+    df.to_hdf(temp_h5_path, key="table", mode="a", format="table")
+    tm.assert_frame_equal(read_hdf(temp_h5_path, "table"), df)
+
+
+def test_string_column_literal_nan_and_real_nan(temp_hdfstore):
+    # GH#9604 — companion to the index tests: ensure the symmetric nan_rep
+    # substitution still works on a data column, i.e. a custom nan_rep lets
+    # both a literal "nan" string and an actual NaN round-trip correctly.
+    df = DataFrame({"a": ["x", "nan", np.nan, "y"]}, index=["i1", "i2", "i3", "i4"])
+
+    temp_hdfstore.append("df", df, nan_rep="_NA_")
+    result = temp_hdfstore.select("df")
+
+    tm.assert_frame_equal(result, df)
+
+
 def test_api(temp_h5_path):
    # GH4584
    # API issue when to_hdf doesn't accept append AND format args