Add JSON importer that loads API-compatible drop-in JSON modules

sebastian-nagel · sebastian-nagel · commit 7f50c4998c61 · 2025-05-27T12:46:21.000+02:00
- try to load the most performant module first: "orjson" (most performant drop-in replacement, cf. #41) - if loading fails, fall back to "ujson" ("UltraJSON", proven since the beginning of cc-pyspark) and finally to "json" (Python Standard Library)
diff --git a/json_importer.py b/json_importer.py
@@ -0,0 +1,12 @@
+"""Import a JSON module as `json`, trying faster parsers first: orjson, ujson,
+   json. Only loads() is reliably drop-in: orjson.dumps() returns bytes, not str.
+   Cf. https://github.com/commoncrawl/cc-pyspark/issues/41
+"""
+
+try:
+    import orjson as json
+except ImportError:
+    try:
+        import ujson as json
+    except ImportError:
+        import json
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ botocore
 boto3
 requests
 ujson
+orjson
 warcio
 
 # for link extraction and webgraph construction also:
diff --git a/server_count.py b/server_count.py
@@ -1,6 +1,5 @@
-import ujson as json
-
 from sparkcc import CCSparkJob
+from json_importer import json
 
 
 class ServerCountJob(CCSparkJob):
diff --git a/wat_extract_links.py b/wat_extract_links.py
@@ -2,13 +2,12 @@
 import os
 import re
 
-import ujson as json
-
 from urllib.parse import urljoin, urlparse
 
 from pyspark.sql.types import StructType, StructField, StringType
 
 from sparkcc import CCSparkJob
+from json_importer import json
 
 
 class ExtractLinksJob(CCSparkJob):