Skip to content

Commit 9948696

Browse files
committed
Fix duplicate docstring causing docs not to build [run doc]
1 parent 69de5c2 commit 9948696

File tree

1 file changed

+68
-76
lines changed

1 file changed

+68
-76
lines changed

python/sparknlp/reader/sparknlp_reader.py

Lines changed: 68 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -15,82 +15,19 @@
1515

1616

1717
class SparkNLPReader(ExtendedJavaWrapper):
18-
"""Instantiates class to read HTML files.
18+
"""Instantiates class to read HTML, email, and document files.
1919
20-
Two types of input paths are supported,
20+
For HTML input, two types of input paths are supported:
2121
22-
htmlPath: this is a path to a directory of HTML files or a path to an HTML file
23-
E.g. "path/html/files"
24-
25-
url: this is the URL or set of URLs of a website . E.g., "https://www.wikipedia.org"
22+
- `htmlPath`: A path to a directory of HTML files or a single HTML file (e.g., `"path/html/files"`).
23+
- `url`: A single URL or a set of URLs (e.g., `"https://www.wikipedia.org"`).
2624
2725
Parameters
2826
----------
29-
params : spark
30-
Spark session
27+
spark : SparkSession
28+
The active Spark session.
3129
params : dict, optional
32-
Parameter with custom configuration
33-
34-
Examples
35-
--------
36-
>>> from sparknlp.reader import SparkNLPReader
37-
>>> html_df = SparkNLPReader().html(spark, "https://www.wikipedia.org")
38-
39-
You can use SparkNLP for one line of code
40-
>>> import sparknlp
41-
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
42-
>>> html_df.show(truncate=False)
43-
44-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
45-
|url |html |
46-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
47-
|https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
48-
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
49-
>>> html_df.printSchema()
50-
51-
root
52-
|-- url: string (nullable = true)
53-
|-- html: array (nullable = true)
54-
| |-- element: struct (containsNull = true)
55-
| | |-- elementType: string (nullable = true)
56-
| | |-- content: string (nullable = true)
57-
| | |-- metadata: map (nullable = true)
58-
| | | |-- key: string
59-
| | | |-- value: string (valueContainsNull = true)
60-
61-
62-
63-
Instantiates class to read email files.
64-
65-
emailPath: this is a path to a directory of HTML files or a path to an HTML file E.g.
66-
"path/html/emails"
67-
68-
Examples
69-
--------
70-
>>> from sparknlp.reader import SparkNLPReader
71-
>>> email_df = SparkNLPReader().email(spark, "home/user/emails-directory")
72-
73-
You can use SparkNLP for one line of code
74-
>>> import sparknlp
75-
>>> email_df = sparknlp.read().email("home/user/emails-directory")
76-
>>> email_df.show(truncate=False)
77-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
78-
|email |
79-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
80-
|[{Title, Email Text Attachments, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>}}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, mimeType -> text/plain}}, {NarrativeText, <html>\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>\r\n</head>\r\n<body dir="ltr">\r\n<span style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">Email&nbsp; test with two text attachments</span>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\nCheers,</div>\r\n<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">\r\n<br>\r\n</div>\r\n</body>\r\n</html>\r\n, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, contentType -> text/plain; 
name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano <[email protected]>, sent_from -> Danilo Burbano <[email protected]>, mimeType -> text/plain}}]|
81-
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
82-
email_df.printSchema()
83-
root
84-
|-- path: string (nullable = true)
85-
|-- content: array (nullable = true)
86-
|-- email: array (nullable = true)
87-
| |-- element: struct (containsNull = true)
88-
| | |-- elementType: string (nullable = true)
89-
| | |-- content: string (nullable = true)
90-
| | |-- metadata: map (nullable = true)
91-
| | | |-- key: string
92-
| | | |-- value: string (valueContainsNull = true)
93-
30+
A dictionary with custom configurations.
9431
"""
9532

9633
def __init__(self, spark, params=None):
@@ -100,22 +37,77 @@ def __init__(self, spark, params=None):
10037
self.spark = spark
10138

10239
def html(self, htmlPath):
40+
"""Reads HTML files or URLs and returns a Spark DataFrame.
41+
42+
Parameters
43+
----------
44+
htmlPath : str or list of str
45+
Path to an HTML file or a directory of HTML files, a single URL, or a list of paths/URLs.
46+
47+
Returns
48+
-------
49+
pyspark.sql.DataFrame
50+
A DataFrame containing the parsed HTML content.
51+
52+
Examples
53+
--------
54+
>>> from sparknlp.reader import SparkNLPReader
55+
>>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")
56+
57+
You can also use SparkNLP to simplify the process:
58+
59+
>>> import sparknlp
60+
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
61+
>>> html_df.show(truncate=False)
62+
"""
10363
if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
10464
raise TypeError("htmlPath must be a string or a list of strings")
10565
jdf = self._java_obj.html(htmlPath)
106-
dataframe = self.getDataFrame(self.spark, jdf)
107-
return dataframe
66+
return self.getDataFrame(self.spark, jdf)
10867

10968
def email(self, filePath):
69+
"""Reads email files and returns a Spark DataFrame.
70+
71+
Parameters
72+
----------
73+
filePath : str
74+
Path to an email file or a directory containing emails.
75+
76+
Returns
77+
-------
78+
pyspark.sql.DataFrame
79+
A DataFrame containing parsed email data.
80+
81+
Examples
82+
--------
83+
>>> from sparknlp.reader import SparkNLPReader
84+
>>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
85+
86+
Using SparkNLP:
87+
88+
>>> import sparknlp
89+
>>> email_df = sparknlp.read().email("home/user/emails-directory")
90+
>>> email_df.show(truncate=False)
91+
"""
11092
if not isinstance(filePath, str):
11193
raise TypeError("filePath must be a string")
11294
jdf = self._java_obj.email(filePath)
113-
dataframe = self.getDataFrame(self.spark, jdf)
114-
return dataframe
95+
return self.getDataFrame(self.spark, jdf)
11596

11697
def doc(self, docPath):
98+
"""Reads document files and returns a Spark DataFrame.
99+
100+
Parameters
101+
----------
102+
docPath : str
103+
Path to a document file.
104+
105+
Returns
106+
-------
107+
pyspark.sql.DataFrame
108+
A DataFrame containing parsed document content.
109+
"""
117110
if not isinstance(docPath, str):
118111
raise TypeError("docPath must be a string")
119112
jdf = self._java_obj.doc(docPath)
120-
dataframe = self.getDataFrame(self.spark, jdf)
121-
return dataframe
113+
return self.getDataFrame(self.spark, jdf)

0 commit comments

Comments
 (0)