@@ -48,7 +48,7 @@ mod tests {
     use datafusion_physical_plan::{collect, ExecutionPlan};

     use arrow::array::{
-        BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
+        Array, BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray,
     };
     use arrow::compute::concat_batches;
     use arrow::csv::ReaderBuilder;
@@ -1256,4 +1256,181 @@ mod tests {
             .build_decoder();
         DecoderDeserializer::new(CsvDecoder::new(decoder))
     }
+
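+    /// Like the deserializer helper above, but with `truncated_rows` enabled
+    /// on the underlying Arrow decoder.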
+    fn csv_deserializer_with_truncated(
+        batch_size: usize,
+        schema: &Arc<Schema>,
+    ) -> impl BatchDeserializer<Bytes> {
+        // Use Arrow's ReaderBuilder with truncated_rows enabled
+        let decoder = ReaderBuilder::new(schema.clone())
+            .with_batch_size(batch_size)
+            .with_truncated_rows(true) // <- enable truncated_rows at runtime
+            .build_decoder();
+        DecoderDeserializer::new(CsvDecoder::new(decoder))
+    }
+
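+    /// Schema inference should succeed on input containing a truncated row
+    /// when `truncated_rows` is enabled via `CsvOptions`.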
+    #[tokio::test]
+    async fn infer_schema_with_truncated_rows_true() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 3 columns, but the first data row has only 2; the second has 3
+        let csv_data = Bytes::from("a,b,c\n1,2\n3,4,5\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // Construct CsvFormat and enable truncated_rows via CsvOptions
+        let csv_options = CsvOptions::default().with_truncated_rows(true);
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_options(csv_options)
+            .with_schema_infer_max_rec(10);
+
+        let inferred_schema = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await?;
+
+        // The header has 3 columns, so the inferred schema should too
+        assert_eq!(inferred_schema.fields().len(), 3);
+
+        // All inferred columns should be nullable
+        for f in inferred_schema.fields() {
+            assert!(f.is_nullable());
+        }
+
+        Ok(())
+    }
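+
+    /// Decoding a row that is missing its trailing column should yield a
+    /// NULL in that column rather than an error.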
+    #[test]
+    fn test_decoder_truncated_rows_runtime() -> Result<()> {
+        // Synchronous test: the Decoder API exercised here is not async
+        let schema = csv_schema(); // helper already defined in this file
+
+        // Construct a decoder that enables truncated_rows at runtime
+        let mut deserializer = csv_deserializer_with_truncated(10, &schema);
+
+        // Provide two rows: the first complete, the second missing its last column
+        let input = Bytes::from("0,0.0,true,0-string\n1,1.0,true\n");
+        deserializer.digest(input);
+
+        // Signal end of input and collect the output
+        deserializer.finish();
+
+        let output = deserializer.next()?;
+        match output {
+            DeserializerOutput::RecordBatch(batch) => {
+                // Ensure at least two rows are present
+                assert!(batch.num_rows() >= 2);
+                // Column 4 (index 3) should be a StringArray whose second row is NULL
+                let col4 = batch
+                    .column(3)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .expect("column 4 should be StringArray");
+
+                // First row present, second row should be null
+                assert!(!col4.is_null(0));
+                assert!(col4.is_null(1));
+            }
+            other => panic!("expected RecordBatch but got {other:?}"),
+        }
+        Ok(())
+    }
+
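+    /// With `truncated_rows` left at its default (false), schema inference
+    /// over a truncated row is expected to fail.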
+    #[tokio::test]
+    async fn infer_schema_truncated_rows_false_error() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+
+        // CSV: header has 4 columns, first data row has only 3 -> truncated at the end
+        let csv_data = Bytes::from("id,a,b,c\n1,foo,bar\n2,foo,bar,baz\n");
+        let variable_object_store = Arc::new(VariableStream::new(csv_data, 1));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: u64::MAX,
+            e_tag: None,
+            version: None,
+        };
+
+        // CsvFormat without truncated_rows enabled (the default is false)
+        let csv_format = CsvFormat::default()
+            .with_has_header(true)
+            .with_schema_infer_max_rec(10);
+
+        let res = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await;
+
+        // Expect an error due to unequal lengths / an incorrect number of fields
+        assert!(
+            res.is_err(),
+            "expected infer_schema to error on truncated rows when disabled"
+        );
+
+        // Optional: check that the message contains indicative text (two known variants)
+        if let Err(err) = res {
+            let msg = format!("{err}");
+            assert!(
+                msg.contains("Encountered unequal lengths")
+                    || msg.contains("incorrect number of fields"),
+                "unexpected error message: {msg}",
+            );
+        }
+
+        Ok(())
+    }
+
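+    /// End-to-end check through `SessionContext::read_csv`: a truncated row
+    /// read from a file should surface as a NULL, not an error.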
+    #[tokio::test]
+    async fn test_read_csv_truncated_rows_via_tempfile() -> Result<()> {
+        use std::io::Write;
+
+        // Create a SessionContext
+        let ctx = SessionContext::new();
+
+        // Create a temp file with a .csv suffix so the reader accepts it
+        let mut tmp = tempfile::Builder::new().suffix(".csv").tempfile()?;
+        // The CSV header is "a,b,c". The first data row is truncated (only "1,2");
+        // the second row is complete.
+        write!(tmp, "a,b,c\n1,2\n3,4,5\n")?;
+        let path = tmp.path().to_str().unwrap().to_string();
+
+        // Build CsvReadOptions with the header present (the default) and
+        // truncated_rows enabled
+        let options = CsvReadOptions::default().truncated_rows(true);
+
+        // Call the API under test
+        let df = ctx.read_csv(&path, options).await?;
+
+        // Collect the results and combine batches so we can inspect columns
+        let batches = df.collect().await?;
+        let combined = concat_batches(&batches[0].schema(), &batches)?;
+
+        // Column 'c' is the 3rd column (index 2); the first data row was
+        // truncated, so its value should be NULL
+        let col_c = combined.column(2);
+        assert!(
+            col_c.is_null(0),
+            "expected first row column 'c' to be NULL due to truncated row"
+        );
+
+        // Also ensure we read at least two rows
+        assert!(combined.num_rows() >= 2);
+
+        Ok(())
+    }
 }