-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
Copy pathTextExtractor.cs
35 lines (32 loc) · 1.05 KB
/
TextExtractor.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
using HtmlAgilityPack;
namespace AkkaWordCounter2.App;
/// <summary>
/// Helpers for pulling visible text out of a parsed HTML document and
/// splitting that text into whitespace-delimited tokens.
/// </summary>
public static class TextExtractor
{
    // Characters that separate one token from the next in ExtractTokens.
    private static readonly char[] TokenSeparators = [' ', '\n', '\r', '\t'];

    /// <summary>
    /// Extracts raw text from a HtmlDocument
    /// </summary>
    /// <remarks>
    /// Shouldn't pick up stuff from script / style tags etc. Text nodes are
    /// trimmed, and nodes that are empty after trimming are skipped. The
    /// document is walked lazily, as the returned sequence is enumerated.
    /// </remarks>
    /// <param name="htmlDocument">The parsed document to read text from.</param>
    /// <returns>The trimmed, non-empty text of each qualifying text node, in the order the nodes are visited.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlDocument"/> is <c>null</c>.</exception>
    public static IEnumerable<string> ExtractText(HtmlDocument htmlDocument)
    {
        // Validate here rather than inside the iterator so that a null argument
        // fails at the call site instead of on the first MoveNext().
        ArgumentNullException.ThrowIfNull(htmlDocument);
        return EnumerateText(htmlDocument.DocumentNode);

        static IEnumerable<string> EnumerateText(HtmlNode root)
        {
            foreach (var node in root.Descendants().Where(IsContentText))
            {
                string text = node.InnerText.Trim();
                if (!string.IsNullOrEmpty(text))
                {
                    yield return text;
                }
            }
        }
    }

    /// <summary>
    /// Splits text into tokens on spaces, tabs, carriage returns and line feeds.
    /// </summary>
    /// <remarks>
    /// Each token is trimmed of surrounding whitespace, and tokens that are
    /// empty after trimming are dropped, so the result never contains an
    /// empty string.
    /// </remarks>
    /// <param name="text">The text to tokenize.</param>
    /// <returns>The non-empty, trimmed tokens in their original order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
    public static IEnumerable<string> ExtractTokens(string text)
    {
        ArgumentNullException.ThrowIfNull(text);

        // TrimEntries combined with RemoveEmptyEntries trims every entry first and
        // then discards the ones that became empty. Trimming after the split (as a
        // separate step) would let tokens made only of other whitespace, such as a
        // form feed, through as empty strings.
        return text.Split(
            TokenSeparators,
            StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    }

    // True for text nodes whose direct parent is neither a script nor a style element.
    private static bool IsContentText(HtmlNode node)
    {
        if (node.NodeType != HtmlNodeType.Text)
        {
            return false;
        }

        // Compare case-insensitively so the filter does not depend on the casing
        // of the element name.
        string parentName = node.ParentNode.Name;
        return !string.Equals(parentName, "script", StringComparison.OrdinalIgnoreCase)
            && !string.Equals(parentName, "style", StringComparison.OrdinalIgnoreCase);
    }
}