From eee2e57b06233f29ef101afa64d4ec33a45f04ec Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 20 Oct 2024 13:18:59 +0200
Subject: [PATCH 1/5] TST: enable csv tests for use_arrow

---
 pyogrio/tests/test_geopandas_io.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 0675c197..e71a13c5 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -94,7 +94,7 @@ def spatialite_available(path):
 
 
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_read_csv_encoding(tmp_path, encoding):
+def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -105,7 +105,7 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, be OK.
-    df = read_dataframe(csv_path, encoding=encoding)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -117,14 +117,14 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     locale.getpreferredencoding().upper() == "UTF-8",
     reason="test requires non-UTF-8 default platform",
 )
-def test_read_csv_platform_encoding(tmp_path):
+def test_read_csv_platform_encoding(tmp_path, use_arrow):
     """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")
 
-    df = read_dataframe(csv_path)
+    df = read_dataframe(csv_path, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -944,7 +944,8 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
 
 
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_write_csv_encoding(tmp_path, encoding):
+@pytest.mark.requires_arrow_write_api
+def test_write_csv_encoding(tmp_path, encoding, use_arrow):
     """Test if write_dataframe uses the default encoding correctly."""
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
@@ -958,7 +959,7 @@ def test_write_csv_encoding(tmp_path, encoding, use_arrow):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=use_arrow)
 
     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:

From 4165e6a841b562b6e32f8cadc3dbaa4df1e546e0 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 20 Oct 2024 14:00:13 +0200
Subject: [PATCH 2/5] Only for utf-8

---
 pyogrio/tests/test_geopandas_io.py | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e71a13c5..e079b7a5 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -93,8 +93,20 @@ def spatialite_available(path):
         return False
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-def test_read_csv_encoding(tmp_path, encoding, use_arrow):
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_pyarrow_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_read_csv_encoding(tmp_path, encoding, arrow):
+    """Test reading CSV files with different encodings.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -105,7 +117,7 @@ def test_read_csv_encoding(tmp_path, encoding, use_arrow):
     # Read csv. The data should be read with the same default encoding as the csv file
     # was written in, but should have been converted to utf-8 in the dataframe returned.
     # Hence, the asserts below, with strings in utf-8, be OK.
-    df = read_dataframe(csv_path, encoding=encoding, use_arrow=use_arrow)
+    df = read_dataframe(csv_path, encoding=encoding, use_arrow=arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]
@@ -943,10 +955,20 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
     assert df.iloc[0].geometry.area > area_canada
 
 
-@pytest.mark.parametrize("encoding", ["utf-8", "cp1252", None])
-@pytest.mark.requires_arrow_write_api
-def test_write_csv_encoding(tmp_path, encoding, use_arrow):
-    """Test if write_dataframe uses the default encoding correctly."""
+@pytest.mark.parametrize(
+    "encoding, arrow",
+    [
+        ("utf-8", False),
+        pytest.param("utf-8", True, marks=requires_arrow_write_api),
+        ("cp1252", False),
+        (None, False),
+    ],
+)
+def test_write_csv_encoding(tmp_path, encoding, arrow):
+    """Test if write_dataframe uses the default encoding correctly.
+
+    Arrow only supports utf-8 encoding.
+    """
     # Write csv test file. Depending on the os this will be written in a different
     # encoding: for linux and macos this is utf-8, for windows it is cp1252.
     csv_path = tmp_path / "test.csv"
@@ -959,7 +981,7 @@ def test_write_csv_encoding(tmp_path, encoding, arrow):
     # same encoding as above.
     df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
     csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
-    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=use_arrow)
+    write_dataframe(df, csv_pyogrio_path, encoding=encoding, use_arrow=arrow)
 
     # Check if the text files written both ways can be read again and give same result.
     with open(csv_path, encoding=encoding) as csv:

From 24c8a06e71805f56c3b9ec70d70e13badc589257 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 22:46:04 +0100
Subject: [PATCH 3/5] Throw a specific and clear error

---
 pyogrio/geopandas.py               | 10 +++++++++-
 pyogrio/tests/test_geopandas_io.py | 15 +++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index 11672b25..d1f0dc9d 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -289,7 +289,15 @@ def read_dataframe(
         kwargs = {"self_destruct": True}
         if arrow_to_pandas_kwargs is not None:
             kwargs.update(arrow_to_pandas_kwargs)
-        df = table.to_pandas(**kwargs)
+
+        try:
+            df = table.to_pandas(**kwargs)
+        except UnicodeDecodeError as ex:
+            # Arrow does not support reading data in a non-UTF-8 encoding
+            raise DataSourceError(
+                "The file being read is not encoded in UTF-8; please use_arrow=False"
+            ) from ex
+
         del table
 
         if fid_as_index:
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index e079b7a5..3d85d1f1 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -130,13 +130,24 @@ def test_read_csv_encoding(tmp_path, encoding, arrow):
     reason="test requires non-UTF-8 default platform",
 )
 def test_read_csv_platform_encoding(tmp_path, use_arrow):
-    """verify that read defaults to platform encoding; only works on Windows (CP1252)"""
+    """Verify that read defaults to platform encoding; only works on Windows (CP1252).
+
+    When use_arrow=True, reading a non-UTF-8 file fails.
+    """
     csv_path = tmp_path / "test.csv"
     with open(csv_path, "w", encoding=locale.getpreferredencoding()) as csv:
         csv.write("näme,city\n")
         csv.write("Wilhelm Röntgen,Zürich\n")
 
-    df = read_dataframe(csv_path, use_arrow=use_arrow)
+    if use_arrow:
+        handler = pytest.raises(
+            DataSourceError, match="The file being read is not encoded in UTF-8"
+        )
+    else:
+        handler = contextlib.nullcontext()
+
+    with handler:
+        df = read_dataframe(csv_path, use_arrow=use_arrow)
 
     assert len(df) == 1
     assert df.columns.tolist() == ["näme", "city"]

From 428b32c12a60fc9ca347e0cbc05b42a2da9b16fe Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 23:02:27 +0100
Subject: [PATCH 4/5] Update test_geopandas_io.py

---
 pyogrio/tests/test_geopandas_io.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 3d85d1f1..b08d8f4d 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -140,19 +140,18 @@ def test_read_csv_platform_encoding(tmp_path, use_arrow):
         csv.write("Wilhelm Röntgen,Zürich\n")
 
     if use_arrow:
-        handler = pytest.raises(
-            DataSourceError, match="The file being read is not encoded in UTF-8"
-        )
+        with pytest.raises(
+            DataSourceError,
+            match="; please use_arrow=False",
+        ):
+            df = read_dataframe(csv_path, use_arrow=use_arrow)
     else:
-        handler = contextlib.nullcontext()
-
-    with handler:
         df = read_dataframe(csv_path, use_arrow=use_arrow)
 
-    assert len(df) == 1
-    assert df.columns.tolist() == ["näme", "city"]
-    assert df.city.tolist() == ["Zürich"]
-    assert df.näme.tolist() == ["Wilhelm Röntgen"]
+        assert len(df) == 1
+        assert df.columns.tolist() == ["näme", "city"]
+        assert df.city.tolist() == ["Zürich"]
+        assert df.näme.tolist() == ["Wilhelm Röntgen"]
 
 
 def test_read_dataframe(naturalearth_lowres_all_ext):
@@ -2194,7 +2193,10 @@ def test_non_utf8_encoding_io_shapefile(tmp_path, encoded_text, use_arrow):
 
     if use_arrow:
         # pyarrow cannot decode column name with incorrect encoding
-        with pytest.raises(UnicodeDecodeError):
+        with pytest.raises(
+            DataSourceError,
+            match="The file being read is not encoded in UTF-8; please use_arrow=False",
+        ):
             read_dataframe(output_path, use_arrow=True)
     else:
         bad = read_dataframe(output_path, use_arrow=False)

From f4f18882268e2b623cd350771a37af6a24a28797 Mon Sep 17 00:00:00 2001
From: Pieter Roggemans
Date: Sun, 26 Jan 2025 23:21:06 +0100
Subject: [PATCH 5/5] Update CHANGES.md

---
 CHANGES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index ac51e8c7..14741ab6 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -5,6 +5,8 @@
 ### Improvements
 
 - Add support to read, write, list, and remove `/vsimem/` files (#457).
+- Raise specific error when trying to read non-UTF-8 file with
+  `use_arrow=True` (#490).
 
 ### Bug fixes
 
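Usage sketch (not part of the patch series above): the snippet below illustrates the behavior this series introduces, assuming pyogrio is installed with the pyarrow extra and a GDAL build that includes the CSV driver. Reading a non-UTF-8 CSV with use_arrow=True is expected to raise the DataSourceError introduced in patch 3, while the non-Arrow path with an explicit encoding still works. The file name and sample data are illustrative only.

# Sketch only: demonstrates the error added by this series, not part of the patches.
import tempfile
from pathlib import Path

from pyogrio import read_dataframe
from pyogrio.errors import DataSourceError

csv_path = Path(tempfile.mkdtemp()) / "cities.csv"

# Write a CSV in a non-UTF-8 encoding (cp1252), as a Windows locale might.
with open(csv_path, "w", encoding="cp1252") as f:
    f.write("näme,city\n")
    f.write("Wilhelm Röntgen,Zürich\n")

# The Arrow read path only handles UTF-8, so this should now raise a clear
# DataSourceError instead of a bare UnicodeDecodeError.
try:
    read_dataframe(csv_path, use_arrow=True)
except DataSourceError as ex:
    print(f"use_arrow=True failed as expected: {ex}")

# The non-Arrow path with an explicit encoding still works.
df = read_dataframe(csv_path, encoding="cp1252", use_arrow=False)
print(df)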