From eee2e57b06233f29ef101afa64d4ec33a45f04ec Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 20 Oct 2024 13:18:59 +0200
Subject: [PATCH 1/5] TST: enable csv tests for use_arrow

---
 pyogrio/tests/test_geopandas_io.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 0675c197..e71a13c5 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -94,7 +94,7 @@ def spatialite_available(path):
 
 
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_read_csv_encoding(tmp_path, encoding):
+def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -105,7 +105,7 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, be OK.
-    df = read_dataframe(csv_path, encoding=encoding)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -117,14 +117,14 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     locale.getpreferredencoding().upper() == "UTF-8",
     reason="test requires non-UTF-8 default platform",
 )
-def test_read_csv_platform_encoding(tmp_path):
+def test_read_csv_platform_encoding(tmp_path, use_arrow):
     """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")
 
-    df = read_dataframe(csv_path)
+    df = read_dataframe(csv_path, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -944,7 +944,8 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
 
 
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_write_csv_encoding(tmp_path, encoding):
+@pytest.mark.requires_arrow_write_api
+def test_write_csv_encoding(tmp_path, encoding, use_arrow):
     """Test if write_dataframe uses the default encoding correctly."""
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
@@ -958,7 +959,7 @@ def test_write_csv_encoding(tmp_path, encoding, use_arrow):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=use_arrow)
 
     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:

From 4165e6a841b562b6e32f8cadc3dbaa4df1e546e0 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 20 Oct 2024 14:00:13 +0200
Subject: [PATCH 2/5] Only for utf-8

---
 pyogrio/tests/test_geopandas_io.py | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e71a13c5..e079b7a5 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -93,8 +93,20 @@ def spatialite_available(path):
         return False
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_read_csv_encoding(tmp_path, encoding, use_arrow):
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_pyarrow_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_read_csv_encoding(tmp_path, encoding, arrow):
+    """Test reading CSV files with different encodings.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -105,7 +117,7 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, be OK.
-    df = read_dataframe(csv_path, encoding=encoding, use_arrow=use_arrow)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -943,10 +955,20 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
     assert df.iloc[0].geometry.area > area_canada
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-@pytest.mark.requires_arrow_write_api
-def test_write_csv_encoding(tmp_path, encoding, use_arrow):
-    """Test if write_dataframe uses the default encoding correctly."""
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_arrow_write_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_write_csv_encoding(tmp_path, encoding, arrow):
+    """Test if write_dataframe uses the default encoding correctly.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -959,7 +981,7 @@ def test_write_csv_encoding(tmp_path, encoding, arrow):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=use_arrow)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)
 
     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:

From 24c8a06e71805f56c3b9ec70d70e13badc589257 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 22:46:04 +0100
Subject: [PATCH 3/5] Throw a specific and clear error

---
 pyogrio/geopandas.py               | 10 +++++++++-
 pyogrio/tests/test_geopandas_io.py | 15 +++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index 11672b25..d1f0dc9d 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -289,7 +289,15 @@ def read_dataframe(
         kwargs = {"self_destruct": True}
         if arrow_to_pandas_kwargs is not None:
             kwargs.update(arrow_to_pandas_kwargs)
-        df = table.to_pandas(**kwargs)
+
+        try:
+            df = table.to_pandas(**kwargs)
+        except UnicodeDecodeError as ex:
+            # Arrow does not support reading data in a non-UTF-8 encoding
+            raise DataSourceError(
+                "The file being read is not encoded in UTF-8; please use_arrow=False"
+            ) from ex
+
         del table
 
         if fid_as_index:
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e079b7a5..3d85d1f1 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -130,13 +130,24 @@ def test_read_csv_encoding(tmp_path, encoding, arrow):
     reason="test requires non-UTF-8 default platform",
 )
 def test_read_csv_platform_encoding(tmp_path, use_arrow):
-    """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
+    """Verify that read defaults to platform encoding; only works on Windows (CP1252).
+
+    When use_arrow=True, reading a non-UTF-8 file fails.
+    """
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")
 
-    df = read_dataframe(csv_path, use_arrow=use_arrow)
+    if use_arrow:
+        handler = pytest.raises(
+            DataSourceError, match="The file being read is not encoded in UTF-8"
+        )
+    else:
+        handler = contextlib.nullcontext()
+
+    with handler:
+        df = read_dataframe(csv_path, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]

From 428b32c12a60fc9ca347e0cbc05b42a2da9b16fe Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 23:02:27 +0100
Subject: [PATCH 4/5] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 3d85d1f1..b08d8f4d 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -140,19 +140,18 @@ def test_read_csv_platform_encoding(tmp_path, use_arrow):
         csv.write("Wilhelm Röntgen,Zürich\n")
 
     if use_arrow:
-        handler = pytest.raises(
-            DataSourceError, match="The file being read is not encoded in UTF-8"
-        )
+        with pytest.raises(
+            DataSourceError,
+            match="; please use_arrow=False",
+        ):
+            df = read_dataframe(csv_path, use_arrow=use_arrow)
     else:
-        handler = contextlib.nullcontext()
-
-    with handler:
         df = read_dataframe(csv_path, use_arrow=use_arrow)
 
-    assert len(df) == 1
-    assert df.columns.tolist() == ["näme", "city"]
-    assert df.city.tolist() == ["Zürich"]
-    assert df.näme.tolist() == ["Wilhelm Röntgen"]
+        assert len(df) == 1
+        assert df.columns.tolist() == ["näme", "city"]
+        assert df.city.tolist() == ["Zürich"]
+        assert df.näme.tolist() == ["Wilhelm Röntgen"]
 
 
 def test_read_dataframe(naturalearth_lowres_all_ext):
@@ -2194,7 +2193,10 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
 
     if use_arrow:
         # pyarrow cannot decode column name with incorrect encoding
-        with pytest.raises(UnicodeDecodeError):
+        with pytest.raises(
+            DataSourceError,
+            match="The file being read is not encoded in UTF-8; please use_arrow=False",
+        ):
             read_dataframe(output_path, use_arrow=True)
     else:
         bad = read_dataframe(output_path, use_arrow=False)

From f4f18882268e2b623cd350771a37af6a24a28797 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 23:21:06 +0100
Subject: [PATCH 5/5] Update CHANGES.md

---
 CHANGES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index ac51e8c7..14741ab6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,6 +5,8 @@
 ### Improvements
 
 - Add support to read, write, list, and remove `/vsimem/` files (#457).
+- Raise specific error when trying to read non-UTF-8 file with
+  `use_arrow=True` (#490).
 
 ### Bug fixes
 
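Usage sketch (not part of the patch series above): the snippet below illustrates the behavior this series introduces, assuming pyogrio is installed with the pyarrow extra and a GDAL build that includes the CSV driver. Reading a non-UTF-8 CSV with use_arrow=True is expected to raise the DataSourceError introduced in patch 3, while the non-Arrow path with an explicit encoding still works. The file name and sample data are illustrative only.

# Sketch only: demonstrates the error added by this series, not part of the patches.
import tempfile
from pathlib import Path

from pyogrio import read_dataframe
from pyogrio.errors import DataSourceError

csv_path = Path(tempfile.mkdtemp()) / "cities.csv"

# Write a CSV in a non-UTF-8 encoding (cp1252), as a Windows locale might.
with open(csv_path, "w", encoding="cp1252") as f:
    f.write("näme,city\n")
    f.write("Wilhelm Röntgen,Zürich\n")

# The Arrow read path only handles UTF-8, so this should now raise a clear
# DataSourceError instead of a bare UnicodeDecodeError.
try:
    read_dataframe(csv_path, use_arrow=True)
except DataSourceError as ex:
    print(f"use_arrow=True failed as expected: {ex}")

# The non-Arrow path with an explicit encoding still works.
df = read_dataframe(csv_path, encoding="cp1252", use_arrow=False)
print(df)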