15
15
16
16
17
17
class SparkNLPReader (ExtendedJavaWrapper ):
18
- """Instantiates class to read HTML files.
18
+ """Instantiates class to read HTML, email, and document files.
19
19
20
- Two types of input paths are supported,
20
+ Two types of input paths are supported:
21
21
22
- htmlPath: this is a path to a directory of HTML files or a path to an HTML file
23
- E.g. "path/html/files"
24
-
25
- url: this is the URL or set of URLs of a website . E.g., "https://www.wikipedia.org"
22
+ - `htmlPath`: A path to a directory of HTML files or a single HTML file (e.g., `"path/html/files"`).
23
+ - `url`: A single URL or a set of URLs (e.g., `"https://www.wikipedia.org"`).
26
24
27
25
Parameters
28
26
----------
29
- params : spark
30
- Spark session
27
+ spark : SparkSession
28
+ The active Spark session.
31
29
params : dict, optional
32
- Parameter with custom configuration
33
-
34
- Examples
35
- --------
36
- >>> from sparknlp.reader import SparkNLPReader
37
- >>> html_df = SparkNLPReader().html(spark, "https://www.wikipedia.org")
38
-
39
- You can use SparkNLP for one line of code
40
- >>> import sparknlp
41
- >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
42
- >>> html_df.show(truncate=False)
43
-
44
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
45
- |url |html |
46
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
47
- |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
48
- +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
49
- >>> html_df.printSchema()
50
-
51
- root
52
- |-- url: string (nullable = true)
53
- |-- html: array (nullable = true)
54
- | |-- element: struct (containsNull = true)
55
- | | |-- elementType: string (nullable = true)
56
- | | |-- content: string (nullable = true)
57
- | | |-- metadata: map (nullable = true)
58
- | | | |-- key: string
59
- | | | |-- value: string (valueContainsNull = true)
60
-
61
-
62
-
63
- Instantiates class to read email files.
64
-
65
- emailPath: this is a path to a directory of HTML files or a path to an HTML file E.g.
66
- "path/html/emails"
67
-
68
- Examples
69
- --------
70
- >>> from sparknlp.reader import SparkNLPReader
71
- >>> email_df = SparkNLPReader().email(spark, "home/user/emails-directory")
72
-
73
- You can use SparkNLP for one line of code
74
- >>> import sparknlp
75
- >>> email_df = sparknlp.read().email("home/user/emails-directory")
76
- >>> email_df.show(truncate=False)
77
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
78
- |email |
79
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
80
- |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >}}, {NarrativeText, Email test with two text attachments\r \n \r \n Cheers,\r \n \r \n , {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >, mimeType -> text/plain}}, {NarrativeText, <html>\r \n <head>\r \n <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r \n <style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r \n </head>\r \n <body dir="ltr">\r \n <span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email test with two text attachments</span>\r \n <div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r \n <br>\r \n </div>\r \n <div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r \n Cheers,</div>\r \n <div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r \n <br>\r \n </div>\r \n </body>\r \n </html>\r \n , {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n , {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] 
>, contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n , {sent_to -> Danilo Burbano <[email protected] >, sent_from -> Danilo Burbano <[email protected] >, mimeType -> text/plain}}]|
81
- +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
82
- email_df.printSchema()
83
- root
84
- |-- path: string (nullable = true)
85
- |-- content: array (nullable = true)
86
- |-- email: array (nullable = true)
87
- | |-- element: struct (containsNull = true)
88
- | | |-- elementType: string (nullable = true)
89
- | | |-- content: string (nullable = true)
90
- | | |-- metadata: map (nullable = true)
91
- | | | |-- key: string
92
- | | | |-- value: string (valueContainsNull = true)
93
-
30
+ A dictionary with custom configurations.
94
31
"""
95
32
96
33
def __init__ (self , spark , params = None ):
@@ -100,22 +37,77 @@ def __init__(self, spark, params=None):
100
37
self .spark = spark
101
38
102
39
def html(self, htmlPath):
    """Parse HTML from files or URLs into a Spark DataFrame.

    Parameters
    ----------
    htmlPath : str or list of str
        A path to an HTML file or directory of HTML files, a URL, or a
        list of such strings.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame produced from the Java reader's result, holding
        the parsed HTML content.

    Raises
    ------
    TypeError
        If ``htmlPath`` is neither a string nor a list made up only of
        strings.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")

    The same reader is reachable through the ``sparknlp`` shortcut:

    >>> import sparknlp
    >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
    >>> html_df.show(truncate=False)
    """
    # A list is acceptable only when every entry is a string (an empty
    # list passes); anything that is not a list must itself be a string.
    if isinstance(htmlPath, list):
        is_valid = all(isinstance(entry, str) for entry in htmlPath)
    else:
        is_valid = isinstance(htmlPath, str)
    if not is_valid:
        raise TypeError("htmlPath must be a string or a list of strings")

    java_df = self._java_obj.html(htmlPath)
    return self.getDataFrame(self.spark, java_df)
108
67
109
68
def email(self, filePath):
    """Parse email files into a Spark DataFrame.

    Parameters
    ----------
    filePath : str
        Location of a single email file, or of a directory that
        contains email files.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame produced from the Java reader's result, holding
        the parsed email data.

    Raises
    ------
    TypeError
        If ``filePath`` is not a string.

    Examples
    --------
    >>> from sparknlp.reader import SparkNLPReader
    >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")

    The same reader is reachable through the ``sparknlp`` shortcut:

    >>> import sparknlp
    >>> email_df = sparknlp.read().email("home/user/emails-directory")
    >>> email_df.show(truncate=False)
    """
    if isinstance(filePath, str):
        java_df = self._java_obj.email(filePath)
        return self.getDataFrame(self.spark, java_df)
    raise TypeError("filePath must be a string")
115
96
116
97
def doc(self, docPath):
    """Parse a document file into a Spark DataFrame.

    Parameters
    ----------
    docPath : str
        Location of the document file to read.

    Returns
    -------
    pyspark.sql.DataFrame
        The DataFrame produced from the Java reader's result, holding
        the parsed document content.

    Raises
    ------
    TypeError
        If ``docPath`` is not a string.
    """
    if isinstance(docPath, str):
        java_df = self._java_obj.doc(docPath)
        return self.getDataFrame(self.spark, java_df)
    raise TypeError("docPath must be a string")
0 commit comments