mirror of
https://github.com/pandas-dev/pandas.git
synced 2026-05-30 01:03:43 +08:00
BUG: read_hdf converting literal 'nan' string in Index to NaN (GH-9604) (#65603)
This commit is contained in:
@@ -351,6 +351,7 @@ I/O
|
||||
- Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
|
||||
- :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`)
|
||||
- Fixed ``MemoryError`` in :meth:`HDFStore.select` when iterating large tables with ``chunksize`` and no ``where`` filter (:issue:`15937`)
|
||||
- Fixed bug in :func:`read_hdf` where the literal string ``"nan"`` in a string :class:`Index` was incorrectly converted to ``NaN`` on read, even when a custom ``nan_rep`` was supplied (:issue:`9604`)
|
||||
- Fixed bug in :meth:`DataFrame.to_hdf` with ``format="table"`` where a :class:`TimedeltaIndex` was reconstructed as a :class:`PeriodIndex` (when ``freq`` was set) or an integer :class:`Index` (otherwise) on read-back (:issue:`21466`)
|
||||
- Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`)
|
||||
- Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`)
|
||||
|
||||
@@ -2927,8 +2927,13 @@ class DataCol(IndexCol):
|
||||
|
||||
# convert nans / decode
|
||||
if kind == "string":
|
||||
# Old files may have been written without nan_rep persisted; the
|
||||
# writer (write_data) defaulted None to "nan", so do the same here.
|
||||
converted = _unconvert_string_array(
|
||||
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
|
||||
converted,
|
||||
nan_rep=nan_rep if nan_rep is not None else "nan",
|
||||
encoding=encoding,
|
||||
errors=errors,
|
||||
)
|
||||
|
||||
return self.values, converted
|
||||
@@ -5569,7 +5574,10 @@ def _unconvert_string_array(
|
||||
Parameters
|
||||
----------
|
||||
data : np.ndarray[fixed-length-string]
|
||||
nan_rep : the storage repr of NaN
|
||||
nan_rep : the storage repr of NaN, or None to skip substitution.
|
||||
Pass None when the writer did not encode NaN as a sentinel string
|
||||
(e.g. for string indices); otherwise legitimate occurrences of the
|
||||
sentinel value would be incorrectly replaced with NaN on read.
|
||||
encoding : str
|
||||
errors : str
|
||||
Handler for encoding errors.
|
||||
@@ -5595,10 +5603,8 @@ def _unconvert_string_array(
|
||||
else:
|
||||
data = data.astype(dtype, copy=False).astype(object, copy=False)
|
||||
|
||||
if nan_rep is None:
|
||||
nan_rep = "nan"
|
||||
|
||||
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
|
||||
if nan_rep is not None:
|
||||
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
|
||||
return data.reshape(shape)
|
||||
|
||||
|
||||
|
||||
@@ -61,6 +61,57 @@ def test_long_strings(temp_hdfstore):
|
||||
tm.assert_frame_equal(df, result)
|
||||
|
||||
|
||||
def test_string_nan_in_index_fixed(temp_h5_path):
    # GH#9604: round-trip a Series whose string Index contains the literal
    # label "nan". The reader used to apply a NaN-sentinel substitution that
    # the writer never performs for indices, turning that label into NaN.
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    expected.to_hdf(temp_h5_path, key="s", mode="w")
    roundtripped = read_hdf(temp_h5_path, "s")

    tm.assert_series_equal(roundtripped, expected)
|
||||
|
||||
|
||||
def test_string_nan_in_index_table(temp_hdfstore):
    # GH#9604: table-format variant of the literal-"nan" index round trip.
    # A custom nan_rep is supplied on append so the test also checks that it
    # is respected when the index is read back (it used to be ignored there).
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    temp_hdfstore.append("s", expected, nan_rep="_nan_")
    roundtripped = temp_hdfstore.select("s")

    tm.assert_series_equal(roundtripped, expected)
|
||||
|
||||
|
||||
def test_string_nan_in_dataframe_index(temp_h5_path):
    # GH#9604: a named DataFrame index holding the literal string "nan" must
    # survive a write/read cycle in both the fixed and the table format.
    row_labels = Index(["nan", "kai", "institute", "of"], name="ix")
    expected = DataFrame({"a": [1, 2, 3, 4]}, index=row_labels)

    # First pass creates the file with the default (fixed) layout; second
    # pass appends a table-format key to the same file.
    write_plans = (
        ("fixed", {"mode": "w"}),
        ("table", {"mode": "a", "format": "table"}),
    )
    for key, write_kwargs in write_plans:
        expected.to_hdf(temp_h5_path, key=key, **write_kwargs)
        tm.assert_frame_equal(read_hdf(temp_h5_path, key), expected)
|
||||
|
||||
|
||||
def test_string_column_literal_nan_and_real_nan(temp_hdfstore):
    # GH#9604: data-column counterpart of the index tests. With a custom
    # nan_rep, a column holding both the literal string "nan" and a genuine
    # missing value should round-trip with each one preserved as-is.
    column_values = ["x", "nan", np.nan, "y"]
    row_labels = ["i1", "i2", "i3", "i4"]
    expected = DataFrame({"a": column_values}, index=row_labels)

    temp_hdfstore.append("df", expected, nan_rep="_NA_")
    roundtripped = temp_hdfstore.select("df")

    tm.assert_frame_equal(roundtripped, expected)
|
||||
|
||||
|
||||
def test_api(temp_h5_path):
|
||||
# GH4584
|
||||
# API issue when to_hdf doesn't accept append AND format args
|
||||
|
||||
Reference in New Issue
Block a user