\documentclass[11pt]{article}
\usepackage{amsmath,amsthm,amsfonts,amssymb}
\newtheorem{defn}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{conj}{Conjecture}
\newtheorem{lemma}{Lemma}
\newtheorem{cor}{Corollary}
\newtheorem{claim}{Claim}
\newtheorem{question}{Question}
\newtheorem{fact}{Fact}
\newtheorem{example}{Example}
\newtheorem{note}{Note}
\newcommand\To{\Rightarrow}
\newcommand\from{\leftarrow}
\newcommand\fromr{\stackrel{R}{\from}}
\renewcommand\b[1]{\textbf{#1}}
\renewcommand\t[1]{\text{#1}}
\renewcommand\c[1]{\mathcal{#1}}
\newcommand\cond{\mathrel{|}}
\input{preamble-light}
\begin{document}
\lecture{4}{February 13, 1997}{Daniel A. Spielman}{Victor Boyko}
\section{Parallel computation}
Today we will consider a simple model for parallel computation. The
model is not of much use in practice, since it is extremely
fine-grained. It is, however, convenient to consider theoretically.
We will use boolean circuits, with each gate executing in parallel.
Each gate will have 2 inputs. The underlying directed graph must be
acyclic. We will usually be concerned with two characteristics of
a circuit:
\begin{description}
\item[Size] total number of gates.
\item[Depth] the length of the longest path from an input to an output.
\end{description}
Intuitively, the size of the circuit corresponds to the
amount of work performed,
while the depth corresponds to running time. Note that we
will not be concerned whether the circuit can be efficiently embedded
in 3D space.
\begin{defn}%
\footnote{Note that we will change this definition later in
the lecture}
[\b{NC}, \b{FNC}]
A language $L$ is in $\b{NC}^k$ if there exists a family of circuits
$\c{C} = \{c_n\}_{n=0}^\infty$, polynomial $p(n)$,
and a function $d(n) =
O(\log^k n)$, such that for each $n$ $c_n$ has
\begin{itemize}
\item exactly $n$ inputs
\item exactly one output
\item size at most $p(n)$
\item depth at most $d(n)$
\end{itemize}
and accepts $L\cap \{0,1\}^n$.
$\b{NC} = \bigcup_k \b{NC}^k$
$\b{FNC}$ is the class of functions computable by such families
circuits (i.e., the difference is that circuits will have multiple
outputs instead of just one).
\end{defn}
\b{NC} is is an abbreviation for ``Nick's class,'' named after Nick
Pippinger.% , a student of Peter Elias.
\subsection{Examples}
\begin{example}
Matrix multiplication (over $\t{GF}(2)$, for simplicity) is in
\b{FNC}. The circuits will have depth $\log n$ and size $2 n^3$.
The formula for elements of the resulting matrix is as follows:
\begin{equation*}
c_{i,j} = \sum_{k=1}^n a_{i,k} b_{k,j}.
\end{equation*}
\begin{figure}[htbp]
\begin{center}
\leavevmode
\setlength{\unitlength}{0.00083300in}%
%
\begingroup\makeatletter\ifx\SetFigFont\undefined
% extract first six characters in \fmtname
\def\x#1#2#3#4#5#6#7\relax{\def\x{#1#2#3#4#5#6}}%
\expandafter\x\fmtname xxxxxx\relax \def\y{splain}%
\ifx\x\y % LaTeX or SliTeX?
\gdef\SetFigFont#1#2#3{%
\ifnum #1<17\tiny\else \ifnum #1<20\small\else
\ifnum #1<24\normalsize\else \ifnum #1<29\large\else
\ifnum #1<34\Large\else \ifnum #1<41\LARGE\else
\huge\fi\fi\fi\fi\fi\fi
\csname #3\endcsname}%
\else
\gdef\SetFigFont#1#2#3{\begingroup
\count@#1\relax \ifnum 25<\count@\count@25\fi
\def\x{\endgroup\@setsize\SetFigFont{#2pt}}%
\expandafter\x
\csname \romannumeral\the\count@ pt\expandafter\endcsname
\csname @\romannumeral\the\count@ pt\endcsname
\csname #3\endcsname}%
\fi
\fi\endgroup
\begin{picture}(3324,2370)(289,-2485)
\thicklines
\put(1051,-1411){\circle{300}}
\put(946,-1516){\line(-1,-1){235}}
\put(1156,-1516){\line( 1,-1){240}}
\put(976,-1411){\line( 1, 0){150}}
\put(1051,-1336){\line( 0,-1){150}}
\put(2851,-1411){\circle{300}}
\put(2746,-1516){\line(-1,-1){235}}
\put(2956,-1516){\line( 1,-1){240}}
\put(2776,-1411){\line( 1, 0){150}}
\put(2851,-1336){\line( 0,-1){150}}
\put(3301,-1861){\circle{300}}
\put(2401,-1861){\circle{300}}
\put(1501,-1861){\circle{300}}
\put(1951,-511){\circle{300}}
\put(601,-1861){\circle{300}}
\multiput(3226,-1936)(6.00000,6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\multiput(3226,-1786)(6.00000,-6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\put(3001,-2311){\line( 3, 4){237}}
\put(3601,-2311){\line(-3, 4){234}}
\multiput(2326,-1936)(6.00000,6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\multiput(2326,-1786)(6.00000,-6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\put(2101,-2311){\line( 3, 4){240}}
\put(2701,-2311){\line(-3, 4){234}}
\multiput(1426,-1936)(6.00000,6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\multiput(1426,-1786)(6.00000,-6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\put(1201,-2311){\line( 3, 4){237}}
\put(1801,-2311){\line(-3, 4){240}}
\put(1876,-511){\line( 1, 0){150}}
\put(1951,-436){\line( 0,-1){150}}
\put(1156,-1306){\line( 1, 1){690}}
\put(2056,-616){\line( 1,-1){690}}
\multiput(526,-1936)(6.00000,6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\multiput(526,-1786)(6.00000,-6.00000){26}{\makebox(6.6667,10.0000){\SetFigFont{7}{8.4}{rm}.}}
\put(301,-2311){\line( 3, 4){237}}
\put(901,-2311){\line(-3, 4){240}}
\put(1951,-361){\line( 0, 1){150}}
\put(3001,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$a_{i,4}$}}}
\put(3601,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$b_{4,j}$}}}
\put(2101,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$a_{i,3}$}}}
\put(2701,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$b_{3,j}$}}}
\put(1201,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$a_{i,2}$}}}
\put(1801,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$b_{2,j}$}}}
\put(301,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$a_{i,1}$}}}
\put(901,-2461){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$b_{1,j}$}}}
\put(1951,-211){\makebox(0,0)[lb]{\smash{\SetFigFont{8}{9.6}{rm}$c_{i,j}$}}}
\end{picture}
\caption{Matrix multiplication for $n=4$}
\label{fig:tree}
\end{center}
\end{figure}
We can compute each $c_{i,j}$ in depth $\log n$: Compute
$a_{i,k} b_{k,j}$ for each $k$ in parallel. Then, to add these $n$
numbers we make a tree of depth $\log n$ (see fig.~\ref{fig:tree}
for an example). We will use $2 n$ nodes in each tree. Total is $2
n^3$ gates.
In practice, we usually don't have $2 n^3$ processors. We can adapt
this scheme for coarse-grained parallelism by having each processor
simulate several gates.
\end{example}
\begin{example}
Graph reachability is in $\b{NC}$.
The problem of graph reachability is as follows: We are given a
directed graph as input, which has two special nodes $s$ and $t$.
Does this graph contain a path from $s$ to $t$? The standard search
algorithm does not parallelize well: Suppose the graph is one long
path from $s$ to $t$ with $n$ nodes. Then it will take $n$ steps to
get from $s$ to $t$. We can, however, use matrix multiplication to
solve this problem.
Let's make a new matrix product, in which $c_{i,j} = \bigvee_k
a_{i,k} \wedge b_{k,j}$. Say $A$ and $B$ are adjacency matrices
(which have self-loops for all nodes). Then $c_{i,j}$ is 1 if it is
possible go from $i$ to $j$ by taking a step in $A$ followed by a
step in $B$. We want to compute the transitive closure of our graph
$G$, $G^n$. We will compute it by repeated squaring. Thus we will
have $\log n$ multiplications, each with $n^3$ size and $\log n$
depth.
The resulting circuit has $n^3 \log n$ size and $\log^2 n$ depth.
\end{example}
It has been shown last semester that directed graph reachability is
complete for $\b{NL}$. Therefore, we have
\begin{cor}
$\b{NL} \subseteq \b{NC}$.
\end{cor}
\begin{example}
Some problems are known to be in $\b{NC}$, but not in a practical
fashion. For instance, determinant is in \b{NC} but
the best \b{NC} algorithm for it has size $n^8$ and depth
$\log^4 n$.
\end{example}
\subsection{Log-space mapping reduction}
\begin{defn}[log-space mapping reduction]
$A\leq_{\log} B$ if $\exists$ a log-space computable $f$ s.t.\ $x\in
A$ iff $f(x)\in B$.
\end{defn}
Note: in order to make sense of this definition, we need to have a
read-only input tape and a write-only output tape.
It has been shown last semester that
\begin{lemma}
If $B\in \b{LOGSPACE}$ and $A\leq_{\log} B$, then $A\in \b{LOGSPACE}$.
\end{lemma}
We'll re-define \b{NC} to be logspace-uniform: in addition to the properties
in the definition above, require that there exist a space $O(\log^k
n)$ machine $M$ s.t.\ on input $1^n$, $M$ outputs $c_n$.
One consequence of logspace-uniformity is
\begin{lemma}
$\b{NC}^k\subseteq \b{SPACE}(\log^k n)$
\end{lemma}
Note that this might be false if we used polytime-uniformity (i.e.,
$M$ was only required to be polytime).
\begin{proof}
The circuit can be generated in logspace by definition. We can then
evaluate the circuit using DFS starting from the top. The whole
circuit never needs to be stored, since whenever we need to find out
children of a gate, we simulate the machine that generates the
circuit until it gets to that point. This mode of evaluation could take up to
$2^{\log^k n}$ time. The space needed is just the size of the
stack, which is the depth of the circuit.
\end{proof}
It is not known whether $\b{NC}^k = \b{SPACE}((\log n)^k)$. That would
be unlikely, however, since $\b{NC}\subseteq \b{P}$ and it is unlikely
that $\b{SPACE}((\log n)^k) \subseteq \b{P}$.
\subsection{$\b{NC}$ vs.\ $\b{P}$}
The big question in the theory of parallel computation is whether
$\b{NC} = \b{P}$. There exist problems that are $\b{P}$-complete under
logspace reductions. (Note that all problems presently known
to be $\b{NP}$-complete are complete under logspace reductions.)
\subsubsection{Circuit value problem}
The circuit value problem CVAL is, given a circuit $C$ and inputs
$x_1$, \ldots, $x_n$, output $C(x_1, \ldots, x_n)$.
\begin{theorem}
CVAL is $\b{P}$-complete under logspace reductions.
\end{theorem}
\begin{proof}
Obviously, the problem is in $\b{P}$. To show that CVAL is
$\b{P}$-complete means for every TM $M$ being able to construct a
logspace computable function that takes as input $\bar{x}$, and
outputs a circuit $C$ and an input $\bar{y}$ s.t.\ $C(\bar{y})$
accepts iff $M$ accepts $x$ in time $t$.
Let $t(n)$ be a bound on the running time of $M$. We will make a table
$c$, where $c_{i,t}$ is the contents of cell $i$ at time $t$.
$c_{i,t}$ only depends on $c_{i-1,t}$, $c_{i,t}$, and $c_{i+1,t}$. We
will have for each $i$ and $t$ a circuit segment (of constant size)
which computes $c_{i,t}$ from those three inputs. Our machine will be
able to output the description of the whole circuit $C$ by going
through a loop over all the cells. $\bar{y}$ is $\bar{x}$ extended
with blanks.
Thus, the circuit produced from a $t(n)$-time machine will have size
$(t(n))^2$.\footnote{It is possible to get a circuit of size $t(n)
\log t(n)$, but that is tricky.}
\end{proof}
\end{document}