\section{Inferring the weight sharing scheme}
\label{sec:trans}
\subsection{Methodology}
As we saw in \secref{sec:sym} (\figref{fig:rect}), the classical convolution on grid graphs can be obtained visually by translating a rectangular window over the pixel domain. The idea of this section is to similarly define a convolution on graph domains, using a notion of translation defined on graphs~\citep{pasdeloup2017translations}. The translations we use match Euclidean translations on 2D grid graphs~\citep{GrePasViaGri201610} and extend them to general graphs by preserving three simple key properties, which we detail later: injectivity, edge-constraint, and neighborhood-preservation.
Given a set of graph translations $\ct = \{t_i, i \in \seq{\kappa - 1}\} \cup \{t_0 = \id \}$, the corresponding convolution can be expressed using the same formalism we used in \chapref{chap:2}, as:
\begin{gather}
w \ast x = \displaystyle\sum_{t \in \ct} w[t] \h{2} t(x)
\end{gather}
where $x$ is a signal over the vertices of a graph $\gve$, and $w$ is a signal defined on $\ct \subset \Phi_{\EC}(G)$. Depending on the algebraic nature of $\ct$ (which we will discuss later), the theoretical results obtained in \chapref{chap:2} hold. Under the ternary representation, finding such translations amounts to inferring the weight sharing scheme $S$.
By denoting $w_i = w[t_i]$, we obtain the more familiar expression:
\begin{gather}
\forall u \in V, w \ast x [u] = \displaystyle\sum_{i=0}^{\kappa-1} w_i \h{2} x[t_i^{-1}(u)]
\end{gather}
This expression extends the definition of convolutions from grid graphs to general graphs by means of graph translations. We use it to obtain extended CNNs that can be applied to graphs. In this manuscript, we coin the term \emph{Translation-convolutional neural network} (TCNN) to refer to them, but they are the same as the extended convolutions we defined in \cite{pasdeloup2017convolutional}.
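To fix ideas, the following Python sketch (the helper and variable names are ours and are not taken from our implementation) applies this convolution when each translation $t_i$ is stored as a precomputed index array of its inverse map $t_i^{-1}$; the toy example uses the identity and the clockwise shift, two of the lossless translations of the cycle graph $C_4$:
\begin{verbatim}
import numpy as np

def translation_convolution(x, w, inv_maps):
    """Sketch of w * x on a graph with n vertices.

    x        : (n,) signal on the vertices.
    w        : (kappa,) one weight per translation, w[0] paired with the identity.
    inv_maps : (kappa, n) integer array, inv_maps[i, u] = t_i^{-1}(u).
    """
    y = np.zeros_like(x, dtype=float)
    for u in range(x.shape[0]):
        for i in range(w.shape[0]):
            y[u] += w[i] * x[inv_maps[i, u]]
    return y

# Toy check on the cycle graph C_4: t_0 is the identity and t_1 the
# clockwise shift, whose inverse maps u to u-1 (mod 4).
x = np.array([1.0, 2.0, 3.0, 4.0])
inv_maps = np.stack([np.arange(4), (np.arange(4) - 1) % 4])
print(translation_convolution(x, np.array([0.5, 0.5]), inv_maps))  # [2.5 1.5 2.5 3.5]
\end{verbatim}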
Besides defining translation-convolution on general graphs, designing a TCNN also implies extending the other building blocks of classical CNNs. As a counterpart for pooling operations, we define a graph subsampling and apply strided translation-convolutions. The translations of the subsampled graph are derived from those of the base graph to define translation-convolution at the downscaled level. We also use a weak form of data augmentation based on these translations. \figref{fig:outline} depicts the proposed methodology~\citep{lassance2018matching}.
\input{chapter3/fig_tcnn}
\subsection{Translations}
% Define a graph $G = \langle V, E \rangle$ with $V$ the set of vertices, and $E \subseteq\binom{V}{2}$ the set of edges. We suppose the graph is connected, as conversely the process can be applied to each connected component of $G$. We denote by $d$ the max degree of the graph and $n = |V|$ the number of vertices.
% The authors of~\cite{pasdeloup2017convolutional} propose to inductively define translations as functions from vertices to vertices as follows:
Let $\gve$ be a graph. We suppose the graph is connected; otherwise, the process can be applied to each connected component of $G$. We denote by $d$ the maximum degree of the graph and by $n = |V|$ its number of vertices.
\begin{definition}\textbf{Candidate-translation}\\
A \emph{candidate-translation} is a function $\phi: U \to V$, where $U \subset V$ and such that:
\begin{itemize}[noitemsep,nolistsep]
\item $\phi$ is \emph{injective}:\\
$\forall v,v' \in U, \phi(v) = \phi(v') \Rightarrow v = v',$
\item $\phi$ is \emph{edge-constrained}:\\
$\forall v \in U, (v,\phi(v)) \in E,$
\item $\phi$ is \emph{strongly neighborhood-preserving}:\\
$\forall v,v' \in U, (v,v')\in E \Leftrightarrow (\phi(v),\phi(v')) \in E.$
\end{itemize}
\end{definition}
The cardinality $|V \setminus U|$ is called the \emph{loss} of $\phi$.
A translation for which $V=U$ is called a \emph{lossless} translation.
Two candidate-translations $\phi$ and $\phi'$ are said to be \emph{aligned} if there exists a vertex $v$ in both of their domains such that $\phi(v) = \phi'(v)$.
We define $N_r(v)$ as the set of vertices that are at most $r$ hops away from a vertex $v \in V$.\\
\begin{definition}\textbf{Translation}\\
A \emph{translation} in a graph $G$ is either the identity function, or a candidate-translation such that no aligned candidate-translation has a strictly smaller loss.
\end{definition}
% \begin{remark}
If the graph is a 2D grid, the obtained translations are exactly the natural translations on images~\citep{GrePasViaGri201610}.
% \end{remark}
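The three defining properties translate directly into code. The following Python sketch (our own helper names; the edge set is encoded as a set of frozensets) checks whether a vertex map is a candidate-translation and computes its loss; it is reused by the sketches given later in this chapter, and the identity is handled separately, as in the definition of a translation above.
\begin{verbatim}
import itertools

def is_candidate_translation(phi, edges):
    """Check the three defining properties of a candidate-translation.

    phi   : dict mapping every vertex of the domain U to a vertex of V.
    edges : set of frozensets representing the undirected edge set E.
    """
    # Injectivity.
    if len(set(phi.values())) != len(phi):
        return False
    # Edge constraint: every vertex is sent to one of its neighbours
    # (the identity is treated separately in the definition of a translation).
    if any(frozenset((v, phi[v])) not in edges for v in phi):
        return False
    # Strong neighbourhood preservation: (v, v') in E iff (phi(v), phi(v')) in E.
    for v, vp in itertools.combinations(phi, 2):
        if (frozenset((v, vp)) in edges) != (frozenset((phi[v], phi[vp])) in edges):
            return False
    return True

def loss(phi, vertices):
    """Loss of phi, i.e. the number of vertices of V outside its domain U."""
    return len(vertices) - len(phi)
\end{verbatim}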
Because translations and candidate-translations need not be surjective on~$V$, we introduce a zero vertex\footnote{Sometimes called \emph{black hole}~\citep{GrePasViaGri201610}.} denoted $0_V$, such that any candidate-translation $\phi: U \to V$ is extended to a function $\phi: V \to V \cup \{0_V\}$ with
\begin{gather}
\forall v \notin U, \phi(v) = 0_V
\end{gather}
Finding translations is an NP-complete problem~\citep{pasdeloup2017translations}, so in practice we first search for them locally. For this reason, we define local translations:
\begin{definition}\textbf{Local translation}\\
A \emph{local translation} of center $v \in V$ is a translation in the subgraph of $G$ induced by $N_2(v)$, that has $v$ in its definition domain.
\end{definition}
With the help of local translations, we can construct proxies to global translations.
\begin{definition}\textbf{Proxy-translations}\\
A family of \emph{proxy-translations} $(\psi_p)_{p=0,\ldots,\kappa-1}$ initialized by $v_0 \in V$ is defined algorithmically as follows:
\begin{enumerate}
\item We place an indexing kernel on $N_1(v_0) $ i.e.\\ $N_1(v_0) = \{v_0, v_1, ..., v_{\kappa-1}\}$ with $\forall p, \psi_p(v_0) = v_p$,
\item We move this kernel using each local translation $\phi$ of center $v_0$:\\ $\forall p, \psi_p(\phi(v_0)) = \phi(v_p)$,
\item We repeat step 2) from each newly reached center until saturation. If a center is reached again, we keep the indexing that minimizes the sum of the losses of the local translations that have led to it.
\end{enumerate}
\end{definition}
We explain this algorithm in more detail in \secref{sec:alg}. A family of proxy-translations defines a translation-convolution as follows:
\begin{definition}\textbf{Translation-convolution layer}\\
Let $(\psi_p)_{p=0,\ldots,\kappa-1}$ be a family of proxy-translations identified on $G$ \st $\psi_0 = \id$.
The \emph{translation-convolution layer} $\cl: x \mapsto y$ is defined as:
\begin{gather*}
\forall v \in V, y[v] = h\left(\sum_{p=0}^{\kappa-1}{w_p \h{2} x[\psi_p(v)]} + b\right)
\end{gather*}
where $h$ is the activation function, $b$ is the bias term, and with the convention that $x[0_V] = 0$.
\end{definition}
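The following single-channel NumPy sketch mirrors this definition; encoding the zero vertex $0_V$ by the index $-1$ is our own convention, and a complete layer would additionally iterate over input and output feature maps:
\begin{verbatim}
import numpy as np

def translation_convolution_layer(x, w, b, psi, h=np.tanh):
    """Single-channel sketch of the translation-convolution layer.

    x   : (n,) input signal on the n vertices of G.
    w   : (kappa,) weights, w[0] attached to psi_0 = id.
    b   : scalar bias.
    psi : (kappa, n) integer array with psi[p, v] = psi_p(v), where -1
          encodes the zero vertex 0_V (convention x[0_V] = 0).
    h   : activation function.
    """
    x_padded = np.concatenate([x, [0.0]])   # index -1 now reads the zero entry
    gathered = x_padded[psi]                # gathered[p, v] = x[psi_p(v)]
    return h(w @ gathered + b)              # y[v] = h(sum_p w_p x[psi_p(v)] + b)
\end{verbatim}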
% \subsection{Connection with action convolutions}
% \todo{See chapter 2}
% \begin{itemize}[noitemsep,nolistsep]
% \item lossless translations = group action convolution
% \item translations = groupoid action convolution
% \item proxy translations = path convolution
% \end{itemize}
% \todo{SNP == abelian?}
% \todo{state the properties obtained from results of chapter 2}
\subsection{Finding proxy-translations}
\label{sec:alg}
%Finding translations is an NP-complete problem~\cite{pasdeloup2017translations}, such that for large graphs the method is not suitable.
%In order to break down complexity, we propose to search for local translations. They also introduce approximate translations which we omit for the sake of simplicity, but the description would be similar.
We describe in three steps how we efficiently find proxy-translations.
\paragraph{First step: finding local translations}
For each vertex $v \in V$, we identify all local translations using a brute-force algorithm. This process requires finding all translations in all induced subgraphs. There are $n$ such subgraphs, each of which contains at most $d$ local translations. Finding a translation can be performed by looking at all possible injections from the 1-hop vertices around the central vertex to the vertices that are at most 2 hops away. We conclude that it requires at most $\mathcal{O}(nd d^{2(d+1)})$ elementary operations and is thus linear in the order of the graph. On the other hand, this suggests that the sparsity of the graph is a key criterion for keeping the complexity reasonable.
\figref{fig:gridgraph} depicts an example of a grid graph and the induced subgraph around vertex $v_0$. \figref{fig:inducedtranslations} depicts all obtained translations in the induced subgraph.
\input{chapter3/fig_gridgraph}
\input{chapter3/fig_inducedtranslations}
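A simplified Python sketch of this brute-force step is given below. It reuses the candidate-translation checker introduced earlier, restricts the enumeration to maps defined on the whole of $N_1(v)$, and omits the minimal-loss condition among aligned candidates; it is therefore only indicative of the actual search.
\begin{verbatim}
import itertools

def neighbourhood(v, edges, r):
    """Vertices at most r hops away from v (v included)."""
    reached = {v}
    frontier = {v}
    for _ in range(r):
        frontier = {u for f in frontier for e in edges if f in e for u in e} - reached
        reached |= frontier
    return reached

def local_translations(v, edges):
    """Brute-force enumeration of injective, edge-constrained, strongly
    neighbourhood-preserving maps defined on N_1(v) with values in N_2(v),
    checked inside the subgraph induced by N_2(v)."""
    n1 = sorted(neighbourhood(v, edges, 1))
    n2 = sorted(neighbourhood(v, edges, 2))
    sub_edges = {e for e in edges if e <= set(n2)}   # subgraph induced by N_2(v)
    found = []
    for image in itertools.permutations(n2, len(n1)):
        phi = dict(zip(n1, image))
        if is_candidate_translation(phi, sub_edges):
            found.append(phi)
    return found
\end{verbatim}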
\paragraph{Second step: move a small localized kernel}
Given an arbitrary\footnote{In practice, we run several experiments while changing the initial vertex and keep the best obtained result.} vertex $v_0 \in V$, we place an indexing kernel on $N_1(v_0)$, i.e. $N_1(v_0) = \{v_0, v_1, ..., v_{\kappa-1}\}$. Then we move it using every local translation of center $v_0$, repeating this process for each center that is reached for the first time. We stop when the kernel has been moved everywhere in the graph. In case of multiple paths leading to the same destination, we keep the indexing that minimizes the sum of the losses of the series of local translations. We thereby obtain an indexing of at most $\kappa$ vertices of $N_1(v)$ for every $v \in V$.
This process is depicted in \figref{fig:movekernel}. Since it requires moving the kernel everywhere, its complexity is $\mathcal{O}(n d^2)$.
\input{chapter3/fig_movekernel}
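This step can be sketched as a Dijkstra-like traversal in which the accumulated losses play the role of distances. The sketch below reuses the helpers introduced earlier (the function and variable names are ours) and returns, for every reached center $v$, the indexing such that $\texttt{index}[v][p] = \psi_p(v)$; each index $p$ then defines one proxy-translation.
\begin{verbatim}
import heapq

def propagate_kernel(v0, edges, local_translations_of):
    """Move the indexing kernel from v0 across the graph, keeping for each
    centre the indexing reached with the smallest accumulated loss.

    local_translations_of : callable returning the local translations
                            (dicts) of a given centre.
    Returns index, with index[v][p] = psi_p(v) (None when lost, i.e. 0_V).
    """
    # Initial kernel on N_1(v0); index 0 is the centre itself (psi_0 = id).
    index = {v0: [v0] + sorted(neighbourhood(v0, edges, 1) - {v0})}
    best = {v0: 0}
    heap = [(0, v0)]
    while heap:
        acc, v = heapq.heappop(heap)
        if acc > best[v]:
            continue                       # outdated queue entry
        for phi in local_translations_of(v):
            centre = phi[v]
            # Loss of phi inside the subgraph induced by N_2(v).
            cost = acc + len(neighbourhood(v, edges, 2)) - len(phi)
            if centre in best and best[centre] <= cost:
                continue
            # Move every index through phi; indices leaving its domain are lost.
            index[centre] = [phi.get(u) if u is not None else None
                             for u in index[v]]
            best[centre] = cost
            heapq.heappush(heap, (cost, centre))
    return index

# Typical use: propagate_kernel(v0, edges, lambda v: local_translations(v, edges))
\end{verbatim}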
\paragraph{Final step: identifying proxy-translations}
Finally, by looking at the indexings obtained in the previous step, we obtain a family of proxy-translations defined globally on $G$. More precisely, each index defines its own proxy-translation. Note that they are not translations, because only the local properties have been propagated through the second step, so there can exist aligned candidates with smaller losses. However, because we keep the paths with the minimum sum of losses, they are good proxies for translations on $G$.
An illustration on a grid graph is given in \figref{fig:translations}. The complexity is $\mathcal{O}(nd)$. Overall, all three steps are linear in $n$.
\input{chapter3/fig_translations}
\subsection{Subsampling}
Downscaling is a tricky part of the process because it supposes one can somehow regularly sample vertices. As a matter of fact, a non-regular sampling is likely to produce a highly irregular downscaled graph, on which looking for translations inevitably leads to poor accuracy, as we observed in our experiments.% Instead, here we propose to use translations on $G$ to define the extended downscaling layers. They basically are obtained by masking vertices in the initial graph.
Instead, we define the translations of the strided graph using the proxy-translations previously found on $G$.
\paragraph{First step: extended convolution with stride $r$}
%\subsubsection{First step: convolution with stride $r$}
%Here we define an extended convolution layer with stride $r$.
Given an arbitrary initial vertex $v_0 \in V$, the set of kept vertices $V_{\downarrow r}$ is defined inductively as follows:
\begin{itemize}[noitemsep,nolistsep]
\item $V_{\downarrow r}^0 = \{v_0\}$,
\item $\forall t \in \mathbb{N}, V_{\downarrow r}^{t+1} = V_{\downarrow r}^t \cup \{v \in V, (\forall v' \in V_{\downarrow r}^t, v \not\in N_{r-1}(v')) \land (\exists v' \in V_{\downarrow r}^t, v \in N_{r}(v')) \}$.
\end{itemize}
This sequence is non-decreasing and bounded by $V$, so it eventually becomes stationary, and we obtain $V_{\downarrow r} = \lim_t{V_{\downarrow r}^t}$. $V_{\downarrow r}$ is the set of output neurons of the extended convolution layer with stride~$r$. \figref{fig:downscaling} illustrates the first downscaling $V_{\downarrow 2}$ on a grid graph. %% $V_{\downarrow 2}$ is used to define the stride in extended convolution layers.
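The inductive construction of $V_{\downarrow r}$ can be transcribed directly, reusing the neighborhood helper introduced earlier (a sketch under our own encoding of the graph, not our actual implementation):
\begin{verbatim}
def strided_vertices(v0, edges, r):
    """Inductive construction of the kept vertex set for stride r."""
    kept = {v0}
    while True:
        # Vertices within r-1 hops of a kept vertex are excluded ...
        too_close = set().union(*(neighbourhood(v, edges, r - 1) for v in kept))
        # ... and new vertices must be within r hops of some kept vertex.
        reachable = set().union(*(neighbourhood(v, edges, r) for v in kept))
        new = reachable - too_close
        if not new:
            return kept        # the sequence has become stationary
        kept |= new
\end{verbatim}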
\paragraph{Second step: convolutions for the strided graph}
Using the proxy-translations on $G$, we move a localized $r$-hop indexing kernel over $G$. At each location, we associate the vertices of $V_{\downarrow r}$ with indices of the kernel, thus obtaining what we define as induced $_{\downarrow r}$-translations on the set $V_{\downarrow r}$. In other words, when the kernel is centered on $v_0$, if $v_1 \in V_{\downarrow r}$ is associated with the index $p_0$, we obtain $\phi_{p_0}^{\downarrow r}(v_0) = v_1$. Subsequent convolutions at lower scales are defined using these induced $_{\downarrow r}$-translations.
%The downstream convolution's weight-sharing schemes are then obtained by moving a kernel covering the full graph to each vertex in $V_{\downarrow r}$, similarly to Subsection C.
%% Note that every downscaling layers of the extended architecture are defined from the translation found on the graph at the input level of the network. A typical extended CNN architecture would contain ${2^r}$ downscaled layers.
\input{chapter3/fig_downscaling}
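This second step can be sketched as follows, under the assumption that the $r$-hop indexing kernel has already been tabulated at every kept center; the table \texttt{kernel\_index} is a hypothetical input that would be built by composing the proxy-translations found on $G$:
\begin{verbatim}
def induced_translations(kept, kernel_index):
    """Read off the induced translations on the kept vertices.

    kept         : the set of kept vertices (the strided graph).
    kernel_index : dict with kernel_index[v0][p] = vertex of G carrying index p
                   when the r-hop indexing kernel is centred on v0 (assumed to
                   have been built beforehand from the proxy-translations).
    Returns phi with phi[p][v0] = v1 whenever the kept vertex v1 carries index p.
    """
    phi = {}
    for v0 in kept:
        for p, v1 in enumerate(kernel_index[v0]):
            if v1 is not None and v1 in kept:
                phi.setdefault(p, {})[v0] = v1
    return phi
\end{verbatim}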
\subsection{Data augmentation}
Once proxy-translations are obtained on $G$, we use them to move training signals, artificially creating new ones. Note that this type of data augmentation is weaker than the one used for images, since no flipping, scaling, or rotation is used.
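A minimal sketch of this augmentation, using the same index-array encoding of the proxy-translations as in the layer sketch above:
\begin{verbatim}
import numpy as np

def translate_signal(x, psi_p):
    """Move a training signal along one proxy-translation.

    psi_p : (n,) index array with psi_p[v] = psi_p(v), where -1 stands for
            the zero vertex 0_V, so the corresponding entries become 0.
    """
    x_padded = np.concatenate([x, [0.0]])
    return x_padded[psi_p]
\end{verbatim}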
\subsection{Experiments}
\paragraph{Matching CNNs on supervised image classification}
On CIFAR-$10$ (see \secref{sec:datasets}), our models are based on a variant of a deep residual network, namely PreActResNet18 \citep{he2016identity}. We tested different combinations of graph support and data augmentation. For the graph support, we use either a regular 2D grid or an inferred graph obtained by keeping the four neighbors that covary the most. In the second case, which corresponds to scrambled CIFAR-$10$ (see \secref{sec:datasets}), no structure prior has been fed to the process, so the results can be compared with those of \cite{lin2015}. We also report results obtained with ChebNet~\citep{defferrard2016convolutional}, where only the convolutional layers have been replaced for comparison. \tabref{tab:cifar-table} summarizes our results. In particular, it is interesting to note that the results obtained without any structure prior (91.07\%) are only 2.7\% away from the baseline using classical CNNs on images (93.80\%). This gap is even smaller (less than 1\%) when using the grid prior. Also, on scrambled CIFAR-$10$ (the case without prior), our method significantly outperforms the others.%% In the top left corner of the table we fully exploit the fact that the dataset is composed of 2D structured data, and in the bottom right corner this information is not used at all. Even with that disadvantage, the results differ by only 2.7\%, which shows the strength of the method when compared to others.
\begin{table}[h!]
\begin{center}
\caption{CIFAR-$10$ and scrambled CIFAR-$10$ comparison table.}
\vspace{-0.4cm}
\label{tab:cifar-table}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{|l|c|c|c|c|c|}
\hline
\multirow{2}{*}{Support} & \multirow{2}{*}{MLP} %\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}MLP \\\citeauthor{lin2015}\end{tabular}}
& \multirow{2}{*}{CNN} & \multicolumn{2}{c|}{Grid Graph} & Covariance Graph \\ \cline{4-6}
& & & ChebNet$^c$ & Proposed & Proposed \\ \hline
Full Data Augmentation & 78.62\%$^{a,b}$ & \textbf{93.80\%} & 85.13\% & 93.94\% & 92.57\% \\ \hline
Data Augmentation - Flip & ------ & 92.73\% & 84.41\% & 92.94\% & 91.29\% \\ \hline
Graph Data Augmentation & ------ & 92.10\%$^d$ & ------ & 92.81\% & \textbf{91.07\%}$^a$ \\ \hline
None & 69.62\% & 87.78\% & ------ & 88.83\% & 85.88\%$^a$ \\\hline
\end{tabular}
}
%\vspace{-0.2cm}
\end{center}
\begin{flushleft}
\footnotesize{
$^a$ No prior about the structure\\
$^b$ \cite{lin2015}\\
$^c$ \cite{defferrard2016convolutional}\\
$^d$ Data augmentation done with covariance graph
}
\end{flushleft}
\end{table}
\paragraph{Experiments on a fMRI dataset}
We used a shallow network on the PINES dataset (see \secref{sec:datasets}). The results reported in \tabref{tab:iaps-table} show that our method was able to improve over CNNs, MLPs, and other graph-based extended convolutional neural networks.
\begin{table}[h!]
\centering
\caption{PINES fMRI comparison table.}
\label{tab:iaps-table}
\begin{tabular}{|l||c|c||c|c|}
\hline
\multicolumn{1}{|l||}{Support} & \multicolumn{2}{c||}{None} & \multicolumn{2}{c|}{Neighborhood Graph} \\ \hline
Method & MLP & CNN (1x1 kernels) & ChebNet$^c$ & Proposed \\ \hline
Accuracy & 82.62\% & 84.30\% & 82.80\% & \textbf{85.08\%} \\ \hline
\end{tabular}
%\vspace{-.4cm}
\end{table}