Parse read_nsrdb_psm4 header with csv module to keep quoted commas

gaoflow · gaoflow · commit 88f18aa3be87 · 2026-06-08T15:42:03.000+02:00
read_nsrdb_psm4 split the three header lines with a naive str.split(','), which broke spectral-on-demand files whose column names are quoted fields containing commas (e.g. '"GaAs (Bauhuis et al., 2009)"'). Such names were split into spurious columns, causing the read to raise an error. Parse the header lines with the csv module instead, so that quoted fields are kept intact. Fixes #2736.
diff --git a/docs/sphinx/source/whatsnew/v0.15.2.rst b/docs/sphinx/source/whatsnew/v0.15.2.rst
@@ -26,6 +26,10 @@ Bug fixes
   represent the end of the averaging interval, consistent with ERA5
   conventions. (:issue:`2772`, :pull:`2773`)
 
+* :py:func:`pvlib.iotools.read_nsrdb_psm4` now parses the file header with the
+  :py:mod:`csv` module instead of a naive ``str.split(',')``, so quoted column
+  names containing commas (e.g. the material names in spectral-on-demand files)
+  are no longer split into spurious columns. (:issue:`2736`, :pull:`2771`)
 
 Enhancements
 ~~~~~~~~~~~~
@@ -67,6 +71,7 @@ Maintenance
 Contributors
 ~~~~~~~~~~~~
 * :ghuser:`Omesh37`
+* :ghuser:`gaoflow`
 * Cliff Hansen (:ghuser:`cwhanse`)
 * :ghuser:`shethkajal7`
 * Arthur Onno (:ghuser:`ArthurOnnoTerabase`)
diff --git a/pvlib/iotools/psm4.py b/pvlib/iotools/psm4.py
@@ -6,6 +6,7 @@
 https://developer.nlr.gov/docs/solar/nsrdb/nsrdb-GOES-full-disc-v4-0-0-download/
 """
 
+import csv
 import io
 from urllib.parse import urljoin
 import requests
@@ -723,11 +724,16 @@ def read_nsrdb_psm4(filename, map_variables=True):
        <https://web.archive.org/web/20170207203107/https://sam.nrel.gov/sites/default/files/content/documents/pdf/wfcsv.pdf>`_
     """
     with tools._file_context_manager(filename) as fbuf:
+        # The first 3 header lines are parsed with the csv module rather than a
+        # naive str.split(',') so that quoted fields containing commas are kept
+        # intact.  Spectral-on-demand files, for instance, have column names
+        # like '"GaAs (Bauhuis et al., 2009)"' whose embedded commas would
+        # otherwise be split into spurious columns (see GH #2736).
         # The first 2 lines of the response are headers with metadata
-        metadata_fields = fbuf.readline().split(',')
-        metadata_values = fbuf.readline().split(',')
+        metadata_fields = next(csv.reader([fbuf.readline()]))
+        metadata_values = next(csv.reader([fbuf.readline()]))
         # get the column names so we can set the dtypes
-        columns = fbuf.readline().split(',')
+        columns = next(csv.reader([fbuf.readline()]))
         columns[-1] = columns[-1].strip()  # strip trailing newline
         # Since the header has so many columns, excel saves blank cols in the
         # data below the header lines.
diff --git a/tests/iotools/test_psm4.py b/tests/iotools/test_psm4.py
@@ -185,6 +185,31 @@ def test_read_nsrdb_psm4_map_variables():
     assert_index_equal(data.columns, pd.Index(columns_mapped))
 
 
+def test_read_nsrdb_psm4_quoted_columns_with_commas():
+    """spectral-on-demand files have quoted column names containing commas;
+    these must not be split into spurious columns (GH #2736)"""
+    # Minimal NSRDB file whose column header (3rd line) has quoted material
+    # names with embedded commas, which is valid CSV. A naive str.split(',')
+    # would break these into extra columns and raise on read.
+    content = (
+        "Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,"
+        "Elevation,Local Time Zone,Version\n"
+        "NSRDB,1,-,-,-,40.0,-105.0,-7,1600,-7,4.0.1\n"
+        'Year,Month,Day,Hour,Minute,GHI,"GaAs (Bauhuis et al., 2009)",'
+        '"InGaP (Gray, 2008)"\n'
+        "2023,1,1,0,0,0,0.1,0.2\n"
+        "2023,1,1,1,0,5,0.3,0.4\n"
+    )
+    data, metadata = psm4.read_nsrdb_psm4(StringIO(content),
+                                          map_variables=False)
+    assert list(data.columns) == [
+        'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI',
+        'GaAs (Bauhuis et al., 2009)', 'InGaP (Gray, 2008)']
+    assert data.shape == (2, 8)
+    # the embedded-comma data columns round-trip as floats
+    assert data['GaAs (Bauhuis et al., 2009)'].tolist() == [0.1, 0.3]
+
+
 @pytest.mark.remote_data
 @pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
 def test_get_nsrdb_psm4_aggregated_parameter_mapping(nlr_api_key):