Skip to content

An HTML parsing library for Go, an alternative to Python's Beautiful Soup.

License

Notifications You must be signed in to change notification settings

sunshineplan/node

Repository files navigation

node

GoDev Go CoverageStatus GoReportCard

Node is a Go package for parsing HTML and XML documents, inspired by the popular Python library Beautiful Soup. Node provides APIs for extracting data from HTML and XML documents by traversing the parse tree and accessing elements and attributes.

Installation

To install Node, use the go get command:

go get -u github.com/sunshineplan/node

Usage

package main

import (
	"fmt"

	"github.com/sunshineplan/node"
)

// main parses a small HTML document and prints the text of its title and
// of its paragraphs, demonstrating Find and FindAll with the Descendant
// find method.
func main() {
	// Parse an HTML document from a string
	doc, err := node.ParseHTML("<html><head><title>Page Title</title></head><body><p>Hello, World!</p></body></html>")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Find the page title
	title := doc.Find(node.Descendant, node.Title)
	fmt.Println(title.GetText())

	// Find the first paragraph
	p := doc.Find(node.Descendant, node.P)
	fmt.Println(p.GetText())

	// Find all paragraphs
	paragraphs := doc.FindAll(node.Descendant, node.P)
	for _, p := range paragraphs {
		fmt.Println(p.GetText())
	}
}

API

// Node is an interface representing an HTML node.
// It embeds HtmlNode and adds methods for accessing the node's text.
type Node interface {
	HtmlNode

	// String returns a TextNode if the node has only one child whose type is text; otherwise it returns nil.
	String() TextNode

	// Strings returns all of the text nodes inside this node.
	Strings() []TextNode

	// StrippedStrings returns a list of strings generated by Strings, where strings consisting entirely of
	// whitespace are ignored, and whitespace at the beginning and end of strings is removed.
	StrippedStrings() []string

	// GetText concatenates all of the text nodes' content.
	GetText() string
}

// TextNode is an interface representing a text node.
// It embeds HtmlNode and exposes the node's content as a string.
type TextNode interface {
	HtmlNode

	// String returns the content of the text node.
	String() string
}

// HtmlNode is an interface representing an HTML node.
// It is the common method set embedded by both Node and TextNode.
type HtmlNode interface {
	// Raw returns the original *html.Node.
	Raw() *html.Node
	// ToNode converts HtmlNode to Node.
	ToNode() Node
	// ToTextNode converts HtmlNode to TextNode.
	// It panics if the node type is not a text node.
	ToTextNode() TextNode

	// Type returns a NodeType.
	Type() html.NodeType
	// Data returns the tag name for an element node or the content for a text node.
	Data() string
	// Attrs returns an Attributes interface for an element node.
	Attrs() Attributes
	// HasAttr returns whether the node has an attribute.
	HasAttr(string) bool
	// HTML renders the node's parse tree as HTML code.
	HTML() string
	// Readable renders unescaped HTML code.
	Readable() string

	// Parent returns the parent of this node.
	Parent() Node
	// FirstChild returns the first child of this node.
	FirstChild() Node
	// LastChild returns the last child of this node.
	LastChild() Node
	// PrevSibling returns the previous node that is on the same level of the parse tree.
	PrevSibling() Node
	// NextSibling returns the next node that is on the same level of the parse tree.
	NextSibling() Node
	// PrevNode returns the node that was parsed immediately before this node.
	PrevNode() Node
	// NextNode returns the node that was parsed immediately after this node.
	NextNode() Node

	// Parents returns all of this node's parents, collected recursively.
	Parents() []Node
	// Children returns all of this node's direct children.
	Children() []Node
	// Descendants returns all of this node's children, collected recursively.
	Descendants() []Node
	// PrevSiblings returns all of this node's previous nodes that are on the same level of the parse tree.
	PrevSiblings() []Node
	// NextSiblings returns all of this node's next nodes that are on the same level of the parse tree.
	NextSiblings() []Node
	// PrevNodes returns all of the nodes that were parsed before this node.
	PrevNodes() []Node
	// NextNodes returns all of the nodes that were parsed after this node.
	NextNodes() []Node

	// AncestorNodes returns an iterator over the ancestors of n,
	// starting with n.Parent.
	AncestorNodes() iter.Seq[Node]
	// ChildNodes returns an iterator over the immediate children of n,
	// starting with n.FirstChild.
	ChildNodes() iter.Seq[Node]
	// DescendantNodes returns an iterator over all nodes recursively
	// beneath n, excluding n itself. Nodes are visited in depth-first preorder.
	DescendantNodes() iter.Seq[Node]

	// Finder includes a set of find methods.
	Finder
}

// Attributes is an interface that describes a node's attributes with
// methods for getting and iterating over key-value pairs.
type Attributes interface {
	// Range calls the provided function for each key-value pair in the Attributes.
	// Iteration stops if the function returns false for any pair.
	Range(func(key, value string) bool)

	// Get returns the value associated with the specified key and
	// a boolean indicating whether the key exists in the Attributes.
	Get(key string) (value string, exists bool)
}

// Finder represents a set of methods for finding nodes.
type Finder interface {
	// Find searches for the first matched node in the parse tree based on the specified find method and filters.
	Find(FindMethod, TagFilter, ...Filter) Node

	// FindN searches for up to n nodes in the parse tree based on the specified find method and filters.
	FindN(FindMethod, int, TagFilter, ...Filter) []Node

	// FindAll searches for all nodes in the parse tree based on the specified find method and filters.
	FindAll(FindMethod, TagFilter, ...Filter) []Node

	// FindString searches for the first matched text node in the parse tree based on the specified find method and filter.
	FindString(FindMethod, StringFilter) TextNode

	// FindStringN searches for up to n text nodes in the parse tree based on the specified find method and filter.
	FindStringN(FindMethod, int, StringFilter) []TextNode

	// FindAllString searches for all text nodes in the parse tree based on the specified find method and filter.
	FindAllString(FindMethod, StringFilter) []TextNode

	// CSS selector support

	// Select searches for the first matched node in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	Select(string) Node

	// SelectAll searches for all nodes in the parse tree based on the CSS selector.
	// It panics if the selector cannot be parsed.
	SelectAll(string) []Node

	// XPath support

	// XPath searches for all nodes that match the specified XPath expression.
	// It panics if the expression cannot be parsed.
	XPath(string) []Node

	// Evaluate returns the result of the XPath expression.
	// The result type of the expression is one of the following: bool, float64, string, *xpath.NodeIterator.
	Evaluate(string) (any, error)
}

// FindMethod represents the method used to search for nodes in the parse tree.
type FindMethod int

// The available find methods. Descendant is the zero value of FindMethod.
const (
	// Descendant represents a search for nodes that are descendants of the current node.
	Descendant FindMethod = iota

	// NoRecursive represents a search for nodes that are direct children of the current node.
	NoRecursive

	// Parent represents a search for the parent node of the current node.
	Parent

	// PrevSibling represents a search for the previous sibling node of the current node.
	PrevSibling

	// NextSibling represents a search for the next sibling node of the current node.
	NextSibling

	// Previous represents a search for the previous node in the parse tree.
	Previous

	// Next represents a search for the next node in the parse tree.
	Next
)

// TagFilter represents an interface that can be used to filter nodes based on the node element's tag.
type TagFilter interface {
	// NOTE(review): the semantics of Ignore are not shown here; presumably it
	// reports whether tag matching should be skipped. Confirm against the implementation.
	Ignore() bool
	// IsMatch presumably reports whether the filter matches the given node,
	// mirroring Filter.IsMatch. Confirm against the implementation.
	IsMatch(node Node) bool
}

// Filter is an interface that describes a filter that can be used to select nodes.
type Filter interface {
	// IsAttribute reports whether the filter represents an attribute filter.
	IsAttribute() bool

	// IsMatch reports whether the filter matches the given node.
	IsMatch(node Node) bool
}

// StringFilter extends the Filter interface and defines
// a method for checking whether the filter represents a string filter.
type StringFilter interface {
	Filter
	// IsString reports whether the filter represents a string filter.
	IsString() bool
}

Credits

This repo relies on the following third-party projects:

License

The MIT License (MIT)