Skip to content

Commit 88f18aa

Browse files
committed
Parse read_nsrdb_psm4 header with csv module to keep quoted commas
read_nsrdb_psm4 split the three header lines with a naive str.split(','), which broke spectral-on-demand files whose column names are quoted fields containing commas (e.g. '"GaAs (Bauhuis et al., 2009)"'). Such names were split into spurious columns, causing the read to raise an error. The header lines are now parsed with the csv module so that quoted fields are kept intact. Fixes #2736.
1 parent ba4c7c5 commit 88f18aa

3 files changed

Lines changed: 39 additions & 3 deletions

File tree

docs/sphinx/source/whatsnew/v0.15.2.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ Bug fixes
2626
represent the end of the averaging interval, consistent with ERA5
2727
conventions. (:issue:`2772`, :pull:`2773`)
2828

29+
* :py:func:`pvlib.iotools.read_nsrdb_psm4` now parses the file header with the
30+
:py:mod:`csv` module instead of a naive ``str.split(',')``, so quoted column
31+
names containing commas (e.g. the material names in spectral-on-demand files)
32+
are no longer split into spurious columns. (:issue:`2736`, :pull:`2771`)
2933

3034
Enhancements
3135
~~~~~~~~~~~~
@@ -67,6 +71,7 @@ Maintenance
6771
Contributors
6872
~~~~~~~~~~~~
6973
* :ghuser:`Omesh37`
74+
* :ghuser:`gaoflow`
7075
* Cliff Hansen (:ghuser:`cwhanse`)
7176
* :ghuser:`shethkajal7`
7277
* Arthur Onno (:ghuser:`ArthurOnnoTerabase`)

pvlib/iotools/psm4.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
https://developer.nlr.gov/docs/solar/nsrdb/nsrdb-GOES-full-disc-v4-0-0-download/
77
"""
88

9+
import csv
910
import io
1011
from urllib.parse import urljoin
1112
import requests
@@ -723,11 +724,16 @@ def read_nsrdb_psm4(filename, map_variables=True):
723724
<https://web.archive.org/web/20170207203107/https://sam.nrel.gov/sites/default/files/content/documents/pdf/wfcsv.pdf>`_
724725
"""
725726
with tools._file_context_manager(filename) as fbuf:
727+
# The first 3 header lines are parsed with the csv module rather than a
728+
# naive str.split(',') so that quoted fields containing commas are kept
729+
# intact. Spectral-on-demand files, for instance, have column names
730+
# like '"GaAs (Bauhuis et al., 2009)"' whose embedded commas would
731+
# otherwise be split into spurious columns (see GH #2736).
726732
# The first 2 lines of the response are headers with metadata
727-
metadata_fields = fbuf.readline().split(',')
728-
metadata_values = fbuf.readline().split(',')
733+
metadata_fields = next(csv.reader([fbuf.readline()]))
734+
metadata_values = next(csv.reader([fbuf.readline()]))
729735
# get the column names so we can set the dtypes
730-
columns = fbuf.readline().split(',')
736+
columns = next(csv.reader([fbuf.readline()]))
731737
columns[-1] = columns[-1].strip() # csv.reader already drops the newline; trim any stray whitespace
732738
# Since the header has so many columns, excel saves blank cols in the
733739
# data below the header lines.

tests/iotools/test_psm4.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,31 @@ def test_read_nsrdb_psm4_map_variables():
185185
assert_index_equal(data.columns, pd.Index(columns_mapped))
186186

187187

188+
def test_read_nsrdb_psm4_quoted_columns_with_commas():
189+
"""spectral-on-demand files have quoted column names containing commas;
190+
these must not be split into spurious columns (GH #2736)"""
191+
# Minimal NSRDB file whose column header (3rd line) has quoted material
192+
# names with embedded commas, which is valid CSV. A naive str.split(',')
193+
# would break these into extra columns and raise on read.
194+
content = (
195+
"Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,"
196+
"Elevation,Local Time Zone,Version\n"
197+
"NSRDB,1,-,-,-,40.0,-105.0,-7,1600,-7,4.0.1\n"
198+
'Year,Month,Day,Hour,Minute,GHI,"GaAs (Bauhuis et al., 2009)",'
199+
'"InGaP (Gray, 2008)"\n'
200+
"2023,1,1,0,0,0,0.1,0.2\n"
201+
"2023,1,1,1,0,5,0.3,0.4\n"
202+
)
203+
data, metadata = psm4.read_nsrdb_psm4(StringIO(content),
204+
map_variables=False)
205+
assert list(data.columns) == [
206+
'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI',
207+
'GaAs (Bauhuis et al., 2009)', 'InGaP (Gray, 2008)']
208+
assert data.shape == (2, 8)
209+
# the embedded-comma data columns round-trip as floats
210+
assert data['GaAs (Bauhuis et al., 2009)'].tolist() == [0.1, 0.3]
211+
212+
188213
@pytest.mark.remote_data
189214
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
190215
def test_get_nsrdb_psm4_aggregated_parameter_mapping(nlr_api_key):

0 commit comments

Comments
 (0)