Commit 67a9e4d

Add initial attempt at pagerank with hacky tests (#1)
* Add initial attempt at pagerank with hacky tests
* Install networkx too
* Fix failing test; maybe faster/better?
* Even better
* Split pagerank into two functions: grblas-native and networkx-facing. Also, optimize if adjacency matrix is iso-valued. I would bring this implementation to a benchmarking shootout!
* Change how we convert NetworkX dicts to vectors. Also, sparsify vectors. May be a decent idea? Not sure. It probably doesn't matter most of the time, but I guess there's a chance it can make the matrix-vector multiply faster for some inputs. We don't drop 0s from the input matrix, because that would be expensive.
* Clean up
* Don't be cute; don't use masks, because they're bad for benchmarks. Also, add basic benchmark script.
* Update to use latest grblas; also, add verify option to bench script.
* Show grid of absolute differences between benchmark results
1 parent c544c79 commit 67a9e4d
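
The networkx-facing `pagerank` added here is intended as a drop-in for `nx.pagerank` (see the README and test changes below). A minimal usage sketch, with a made-up toy graph purely for illustration:

```python
import networkx as nx

from graphblas_algorithms import pagerank

# Hypothetical toy graph; any NetworkX graph should work the same way.
G = nx.DiGraph([(0, 1), (1, 2), (2, 0), (2, 3)])

# Same signature as nx.pagerank; returns a dict mapping each node to its score.
scores = pagerank(G, alpha=0.85, tol=1e-06)
print(scores)
```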

9 files changed: +440 -7 lines changed

.github/workflows/test.yml  (+1 -1)

@@ -30,7 +30,7 @@ jobs:
           activate-environment: testing
       - name: Install dependencies
         run: |
-          conda install -c conda-forge grblas pytest coverage black flake8 coveralls
+          conda install -c conda-forge grblas networkx scipy pytest coverage black flake8 coveralls
           pip install -e .
       - name: Style checks
         run: |

README.md  (+9 -4)

@@ -1,13 +1,18 @@
 # **GraphBLAS Algorithms**
 
-[![conda-forge](https://img.shields.io/conda/vn/conda-forge/graphblas-algorithms.svg)](https://anaconda.org/conda-forge/graphblas-algorithms)
 [![pypi](https://img.shields.io/pypi/v/graphblas-algorithms.svg)](https://pypi.python.org/pypi/graphblas-algorithms/)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/metagraph-dev/graphblas-algorithms/blob/main/LICENSE)
 [![Tests](https://github.com/metagraph-dev/graphblas-algorithms/workflows/Tests/badge.svg?branch=main)](https://github.com/metagraph-dev/graphblas-algorithms/actions)
-[![Docs](https://readthedocs.org/projects/graphblas-algorithms/badge/?version=latest)](https://graphblas-algorithms.readthedocs.io/en/latest/)
 [![Coverage](https://coveralls.io/repos/metagraph-dev/graphblas-algorithms/badge.svg?branch=main)](https://coveralls.io/r/metagraph-dev/graphblas-algorithms)
 [![Code style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+<!--- [![conda-forge](https://img.shields.io/conda/vn/conda-forge/graphblas-algorithms.svg)](https://anaconda.org/conda-forge/graphblas-algorithms) --->
+<!--- [![Docs](https://readthedocs.org/projects/graphblas-algorithms/badge/?version=latest)](https://graphblas-algorithms.readthedocs.io/en/latest/) --->
 
-GraphBLAS algorithms written in Python with [`grblas`](https://github.com/metagraph-dev/grblas).
+GraphBLAS algorithms written in Python with [`grblas`](https://github.com/metagraph-dev/grblas). We are trying to target the NetworkX API algorithms where possible.
 
-This is a work in progress. Stay tuned!
+### Installation
+```
+pip install graphblas-algorithms
+```
+
+This is a work in progress. Stay tuned (or come help 😃)!

graphblas_algorithms/__init__.py  (+1)

@@ -1,3 +1,4 @@
 from . import _version
+from .link_analysis import pagerank  # noqa
 
 __version__ = _version.get_versions()["version"]

graphblas_algorithms/conftest.py  (+1)

@@ -0,0 +1 @@
+from networkx.conftest import *  # noqa

graphblas_algorithms/link_analysis.py  (+155)

@@ -0,0 +1,155 @@

from collections import OrderedDict
from warnings import warn

import grblas as gb
import networkx as nx
from grblas import Vector, binary, unary
from grblas.semiring import plus_first, plus_times


def pagerank_core(
    A,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1e-06,
    nstart=None,
    dangling=None,
    row_degrees=None,
    name="pagerank",
):
    N = A.nrows
    if A.nvals == 0:
        return Vector.new(float, N, name=name)

    # Initial vector
    x = Vector.new(float, N, name="x")
    if nstart is None:
        x[:] = 1.0 / N
    else:
        denom = nstart.reduce(allow_empty=False).value
        if denom == 0:
            raise ZeroDivisionError()
        x << nstart / denom

    # Personalization vector or scalar
    if personalization is None:
        p = 1.0 / N
    else:
        denom = personalization.reduce(allow_empty=False).value
        if denom == 0:
            raise ZeroDivisionError()
        p = (personalization / denom).new(name="p")

    # Inverse of row_degrees
    # Fold alpha constant into S
    if row_degrees is None:
        S = A.reduce_rowwise().new(float, name="S")
        S << alpha / S
    else:
        S = (alpha / row_degrees).new(name="S")

    if A.ss.is_iso:
        # Fold iso-value of A into S
        # This lets us use the plus_first semiring, which is faster
        iso_value = A.ss.iso_value
        if iso_value != 1:
            S *= iso_value
        semiring = plus_first[float]
    else:
        semiring = plus_times[float]

    is_dangling = S.nvals < N
    if is_dangling:
        dangling_mask = Vector.new(float, N, name="dangling_mask")
        dangling_mask(mask=~S.S) << 1.0
        # Fold alpha constant into dangling_weights (or dangling_mask)
        if dangling is not None:
            dangling_weights = (alpha / dangling.reduce(allow_empty=False).value * dangling).new(
                name="dangling_weights"
            )
        elif personalization is None:
            # Fast case (and common case); is iso-valued
            dangling_mask(mask=dangling_mask.S) << alpha * p
        else:
            dangling_weights = (alpha * p).new(name="dangling_weights")

    # Fold constant into p
    p *= 1 - alpha

    # Power iteration: make up to max_iter iterations
    xprev = Vector.new(float, N, name="x_prev")
    w = Vector.new(float, N, name="w")
    for _ in range(max_iter):
        xprev, x = x, xprev

        # x << alpha * ((xprev * S) @ A + "dangling_weights") + (1 - alpha) * p
        x << p
        if is_dangling:
            if dangling is None and personalization is None:
                # Fast case: add a scalar; x is still iso-valued (b/c p is also scalar)
                x += xprev @ dangling_mask
            else:
                # Add a vector
                x += plus_first(xprev @ dangling_mask) * dangling_weights
        w << xprev * S
        x += semiring(w @ A)  # plus_first if A.ss.is_iso else plus_times

        # Check convergence, l1 norm: err = sum(abs(xprev - x))
        xprev << binary.minus(xprev | x, require_monoid=False)
        xprev << unary.abs(xprev)
        err = xprev.reduce().value
        if err < N * tol:
            x.name = name
            return x
    raise nx.PowerIterationFailedConvergence(max_iter)


def pagerank(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1e-06,
    nstart=None,
    weight="weight",
    dangling=None,
):
    warn("", DeprecationWarning, stacklevel=2)
    N = len(G)
    if N == 0:
        return {}
    node_ids = OrderedDict((k, i) for i, k in enumerate(G))
    A = gb.io.from_networkx(G, nodelist=node_ids, weight=weight, dtype=float)

    x = p = dangling_weights = None
    # Initial vector (we'll normalize later)
    if nstart is not None:
        indices, values = zip(*((node_ids[key], val) for key, val in nstart.items()))
        x = Vector.from_values(indices, values, size=N, dtype=float, name="nstart")
    # Personalization vector (we'll normalize later)
    if personalization is not None:
        indices, values = zip(*((node_ids[key], val) for key, val in personalization.items()))
        p = Vector.from_values(indices, values, size=N, dtype=float, name="personalization")
    # Dangling nodes (we'll normalize later)
    row_degrees = A.reduce_rowwise().new(name="row_degrees")
    if dangling is not None:
        if row_degrees.nvals < N:  # is_dangling
            indices, values = zip(*((node_ids[key], val) for key, val in dangling.items()))
            dangling_weights = Vector.from_values(
                indices, values, size=N, dtype=float, name="dangling"
            )
    result = pagerank_core(
        A,
        alpha=alpha,
        personalization=p,
        max_iter=max_iter,
        tol=tol,
        nstart=x,
        dangling=dangling_weights,
        row_degrees=row_degrees,
    )
    if result.nvals != N:
        # Not likely, but fill with 0 just in case
        result(mask=~result.S) << 0
    return dict(zip(node_ids, result.to_values()[1]))
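
For reference, the loop above implements the commented update `x << alpha * ((xprev * S) @ A + "dangling_weights") + (1 - alpha) * p`; in standard PageRank notation (restating what the code already does):

$$x_{k+1} = \alpha\, x_k D^{-1} A + \alpha\, d_k + (1-\alpha)\, p, \qquad \text{stop when } \lVert x_{k+1} - x_k \rVert_1 < N \cdot \mathrm{tol},$$

where $D^{-1}$ is the inverse out-degree (folded into `S` along with $\alpha$), $d_k$ redistributes the mass $x_k$ places on dangling rows (`dangling_mask` / `dangling_weights`), and $p$ is the uniform or personalization teleport vector (with $1-\alpha$ folded in before the loop).

The iso-valued fast path can be sketched with a made-up example (the matrix and vector below are hypothetical; the point is that `first(w_k, a_kj)` returns `w_k`, so when every stored value of `A` is 1 the `plus_first` semiring matches `plus_times` while skipping the multiplications):

```python
import grblas as gb
from grblas.semiring import plus_first, plus_times

# Hypothetical unweighted adjacency matrix: every stored value is 1.0.
A = gb.Matrix.from_values(
    [0, 0, 1, 2], [1, 2, 2, 0], [1.0, 1.0, 1.0, 1.0], nrows=3, ncols=3
)
w = gb.Vector.from_values([0, 1, 2], [0.2, 0.3, 0.5])

# plus_first ignores A's stored values; with an all-ones A it agrees with plus_times.
r_first = plus_first[float](w @ A).new()
r_times = plus_times[float](w @ A).new()
assert r_first.isclose(r_times)
```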

graphblas_algorithms/tests/__init__.py

Whitespace-only changes.
+21

@@ -0,0 +1,21 @@

import inspect

import networkx as nx

from graphblas_algorithms import pagerank

nx_pagerank = nx.pagerank
nx_pagerank_scipy = nx.pagerank_scipy

nx.pagerank = pagerank
nx.pagerank_scipy = pagerank
nx.algorithms.link_analysis.pagerank_alg.pagerank_scipy = pagerank


def test_signatures():
    nx_sig = inspect.signature(nx_pagerank)
    sig = inspect.signature(pagerank)
    assert nx_sig == sig


from networkx.algorithms.link_analysis.tests.test_pagerank import *  # isort:skip
