\documentstyle[twocolumn]{article}
\def\for{\hbox{ for }}
\def\sec{\hbox{ section}}
\def\andd{\hbox{ and }}
\def\with{\hbox{ with }}

%%%\pagestyle{empty}
\setlength{\topmargin}{-0.60in}
\setlength{\textheight}{9.0in}
\setlength{\oddsidemargin}{-0.2in}
%%\renewcommand{\baselinestretch}{1.05} %%%%%
\setlength{\evensidemargin}{-0.20in}
\setlength{\textwidth}{6.95in}
\setlength{\columnsep}{.33in}
\addtolength{\parskip}{2pt}
\addtolength{\itemsep}{0.1in}
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\newtheorem{claim}{Claim}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{corollary}[theorem]{Corollary}
\newcommand{\qed}{\mbox{\ \ \ }\rule{6pt}{7pt} \bigskip}
\newcommand{\mathqed}{\mbox{\ \ \ \rule{6pt}{7pt}}}
\newcommand{\orr}{\vee}
\newcommand{\D}{{\cal D}}
\newcommand{\half}{\frac{1}{2}}
\newcommand{\proj}{{\em proj}}
\newcommand{\R}{{\bf R}}
%\newcommand{\erx}{\E_{R}[\; |x|^2\;]}
%\newcommand{\ebx}{\E_{\Bn}[\; |x|^2\;]}
\newcommand{\erx}{\E_{R}[ |x|^2]}
\newcommand{\ebx}{\E_{\Bn}[|x|^2]}
\newcommand{\tC}{\tilde{C}}
\newcommand{\prob}{{\bf Pr}}
\newcommand{\ir}{{\rm irrel}}
\newcommand{\bA}{{\bf A}}
\newcommand{\bB}{{\bf B}}
\newcommand{\bv}{{\bf v}}
\newcommand{\rel}{{\rm rel}}
\newcommand{\sfrac}[2]{{\textstyle \frac{#1}{#2}}}
\newcommand{\V}{{\bf V}}
\newcommand{\W}{{\bf W}}
\renewcommand{\v}{\vec{v}}
\newcommand{\virrel}{{\bf V}_{\rm irrel}}
\newcommand{\vrel}{{\bf V}_{\rm rel}}
\newcommand{\shalf}{{\textstyle \frac{1}{2}}}
\newcommand{\sinv}[1]{\sfrac{1}{#1}}
\newcommand{\inv}[1]{\frac{1}{#1}}
\newcommand{\w}{\vec{w}}
\newcommand{\vx}{\vec{x}}
\newcommand{\halfspace}[1]{\w_{#1} \cdot \vec{x} \leq
a_{#1}}
\newcommand{\E}{{\bf E}}
\newcommand{\Var}{{\rm Var}}
\newcommand{\Bn}{B_n}
\newcommand{\vol}{{\rm vol}}
\newcommand{\upos}{\mu_{\rm pos}}
\newcommand{\ballvol}{\gamma_n}
\newcommand{\slice}{{\sf slice}}
\setlength{\marginparwidth}{-0.4in}
\newcommand{\mnote}[1]
    {\marginpar%
        [{\tiny\begin{minipage}[t]{\marginparwidth}\raggedright#1%
                        \end{minipage}}]%
        {\tiny\begin{minipage}[t]{\marginparwidth}\raggedright#1%
                        \end{minipage}}%
    }
\begin{document}
\renewcommand{\arraystretch}{0.5}
\bibliographystyle{plain}
\thispagestyle{empty}
\title{Learning an Intersection of $k$ Halfspaces over a Uniform Distribution}

\author{{\bf Avrim Blum}\thanks{Supported by an NSF Postdoctoral
Fellowship.  Address: School of Computer Science, Carnegie Mellon
University, Pittsburgh, PA 15213.  Email: avrim@theory.cs.cmu.edu}\\[0.05in]
CMU\\
\and 
{\bf Ravi Kannan}\thanks{Address: Bellcore, 445 South Street,
Morristown, NJ 07960.}\\[0.05in]
Bellcore and CMU
}
\date{}  %{\today}   %{}
\maketitle
%%%\thispagestyle{empty}
\begin{abstract}
We present a polynomial-time algorithm to learn an intersection of a
constant number of halfspaces in $n$ dimensions, over the uniform
distribution on an $n$-dimensional ball.  
The algorithm we present in fact can learn an intersection of
an arbitrary (polynomial) number of halfspaces over this distribution,
if the subspace spanned by the normal vectors to the bounding
hyperplanes has constant dimension.
This generalizes previous
results for this distribution, in particular a result of Baum
\cite{Baum90a} who showed how to learn an intersection of 2
halfspaces defined by hyperplanes that pass through the origin (his
results in fact held for a variety of symmetric 
distributions).  Our algorithm uses estimates of second moments to
find vectors in a 
We believe that the algorithmic techniques studied here may be
useful in other geometric learning applications.
\end{abstract}

\section{Introduction}
The class of 
intersections of halfspaces is a widely studied concept class in
machine learning theory (e.g., \cite{BlumerEhHaWa89,Baum90,Baum90a,BlumRi92}).
Not only are they quite natural 
geometrically, but they also correspond to functions computed by
simple neural networks.  Unfortunately, most of the results known on
the learnability of these concepts in high-dimensional spaces in
standard theoretical models have been negative.  In the Valiant
distribution-free
model, even an intersection of 2 halfspaces cannot be learned in
polynomial time in a representation 
dependent sense (the learner's hypothesis must also be an intersection
of 2 halfspaces) unless {\bf RP}$=${\bf NP} \cite{BlumRi92,Megiddo86}.
Some intuitive arguments for the difficulty of learning this class
in the representation-independent ``prediction'' model (the learner's
hypothesis may
be any polynomial-time prediction algorithm) are given by Baum \cite{Baum90}.
Also, Long and Warmuth \cite{LongWa90} have shown that the class of
convex polytopes given by their vertices is prediction complete for {\bf P}.

In restricted distribution models, however, there has been some
positive success.  In particular, Baum
\cite{Baum90a} showed that an intersection of two homogenous halfspaces
(the hyperplanes that define them must pass through the origin) is
learnable over any distribution $\D$ that is symmetric in the sense
that $\D(x) = \D(-x)$ for all $x$.

We consider here the uniform distribution on the $n$-dimensional unit
ball $\Bn$.  This is a quite natural specific distribution for this
problem.   What we present is an algorithm that can learn in a
probably-approximately-correct prediction sense, any intersection of a constant
number of halfspaces over this distribution.  We also do not require
that the hyperplanes bounding the positive region pass through the
origin.

The high-level idea that our approach is based upon is the following.
Suppose the halfspaces are $\halfspace{1},\; \halfspace{2},\; \ldots,\;
\halfspace{k}$.   Then,  even though examples are points in
$n$-dimensional space, all that really matters for their
classification are their projections onto the $(\leq k)$-dimensional
subspace spanned by the vectors $\w_1, \ldots, \w_k$. This is because
the value of $\w_i \cdot \vec{x}$ is entirely determined by that projection.
Let $\V =$ span$(\w_1, \ldots, \w_k)$.  If we
could somehow find $\V$,
say by finding a set of basis vectors {\em not necessarily the
$\w_i$'s}, we could project all our examples onto this space and
reduce the dimensionality of the learning problem to a constant.  We
then could learn halfspaces in the constant-dimensional space by
standard methods.  So, the idea is to
``focus attention'' on the low-dimensional relevant
subspace hidden in the high-dimensional input space.
Our algorithm will actually be somewhat different from this high-level
intuition, but this is the inspiration for our approach.

Our techniques in fact work for a more general class of concepts than
just intersections of a constant number of halfspaces.  We can
learn in polynomial time an intersection of an arbitrary (polynomially many)
number of halfspaces, so long as the dimension of the ``relevant
space'' $\V$ is constant.

To illustrate how one might hope to find a ``relevant
direction'' (a unit length vector in $\V$) consider the average position of
the positive examples $\upos = \E[ \vx : \vx \mbox{ is positive}]$.  The
vector $\upos$ can be found to $1/poly(n)$ accuracy by simply taking
the average of a large enough sample.  Now, suppose $\v$ is a
unit vector orthogonal to space $\V$.  Then, $\upos \cdot \v = 0$ because
the positive region is symmetric about reflection through the normal
hyperplane to $\v$. That is, for any positive example $\vx = \vx' + \vx_v$
where $\vx_v$ is the component of $\vx$ along direction $\v$, the example
$\vx' - \vx_v$ is also positive.  So, if $\upos$ is noticeably far from
the origin, then a good approximation to $\upos$ is a (good
approximation to) a non-zero vector in $\V$.

While the average $\upos$ is convenient and easy to examine,
unfortunately it might be zero.  Because of this, our main object of
study will in fact be the second moment.  Given a vector $\v$ of
length 1 and a set of points $S$, consider $\E_{\vx \in S}[(\v
\cdot \vx)^2]$.  This is the average squared distance of points $\vx$ from
the hyperplane $\v \cdot \vx = 0$. Given a set $S$, the direction $\v$
that minimizes  $\E_{\vx \in S}[(\v \cdot \vx)^2]$ can be found in
time polynomial in $|S|$ 
by a standard method of eigenvalue analysis (described in section
\ref{sec:find min}).
This is superficially like, but different than, the problem of finding a
hyperplane of least squared error. What we show (section \ref{sec:why
min}) is that
if $\upos$ is sufficiently close to zero, 
%%%plus a few technical conditions, 
then the direction $\v$ that minimizes  $\E_{\vx \in
pos}[(\v \cdot \vx)^2]$ 
(i.e. $\E[(\v \cdot \vx)^2]$ is taken over the uniform distribution on
positive examples $\vx$) is guaranteed to be a vector in $\V$;
moreover, if a few technical conditions are satisfied, then 
significant deviation from being in $\V$ noticeably
increases the second moment.  This argument turns out to be quite a bit more
difficult than the argument given above for the simple average of the
positive points.  We believe that this idea of examining directions of minimum
second moment should be useful more generally in other learning
situations. In fact, our use of minimizing second moments is similar
to the statistical technique of ``principal component analysis''
\cite{Morrison90}.  In that approach, one looks at directions that
maximize variance over unlabeled examples in order to gain information
about their distribution.  In the distribution we consider, all
directions have the same variance over the unlabeled examples, but by
examining the second moments of the positive (or negative) examples
separately, we are able to gain information about the function being
learned.

The above two paragraphs describe the basic idea for how we find one
(approximation to) a relevant direction.  We conjecture, but cannot
prove, that the unit vectors $\v_1, \ldots, \v_k$ where $\v_1 = v$
from before and $\v_i$ is the unit vector among those orthogonal to
$\v_1, \ldots, \v_{i-1}$ that minimizes $\E_{\vx \in
pos}[(\v_i \cdot \vx)^2]$, will span the relevant space (or an
equivalent space to $\V$ if some of the halfspaces are implied by
others). If true, and if vectors with ``noticeable'' ($1/poly$)
components in an irrelevant direction gave noticeably higher second
moment, then this would yield a quite simple learning algorithm.
Unfortunately, we must use a more complicated and less efficient
strategy.  After finding the first relevant direction, our algorithm
then continues 
by looking at ``slices'' perpendicular to
that direction, and employing a recursive approach.  The idea
here is that sufficiently thin slices can roughly be treated as 
$(n-1)$-dimensional balls, with a $(k-1)$-dimensional relevant
subspace.  In part due to the errors introduced in this approximation,
the end hypothesis produced by our algorithm will not be an
intersection of halfspaces, but rather a polynomial-time
prediction algorithm representing a union of hypotheses for each
slice.
The running time and number of examples needed by our procedure are
doubly exponential in $k$.

\iffalse
One complication that arises is that each
direction found is only approximate and has a small irrelevant
component.  Because we need only go $k$ levels deep in the recursion,
however, we  can avoid this issue by finding the first direction to
sufficiently high accuracy that no example in the subsequent portion
of the algorithm will be a witness to its being inexact.
Unfortunately, this brings the running time to doubly exponential in $k$.
We conjecture that this extreme accuracy is not needed, however, and
that the running time can be reduced to singly-exponential in $k$.  In
fact, we conjecture that a simpler (but apparently more difficult to
analyze) non-recursive approach should succeed.
\fi

\section{Notation, definitions, and preliminaries}
For our purposes, an {\em example} is a point in $\R^n$.  A {\em
concept} $P$ is a subset of $\R^n$, and an example $x$ is a positive
example of $P$ if $x \in P$ and is a negative example of $P$
otherwise (we use ``$P$'' to emphasize that the concept is the
set of {\em positive} examples). 
%% We also treat a concept as a boolean function, where $P(x) = 1$ if
%% $x \in P$ and $P(x) = 0$ otherwise.  
A {\em labeled example} is an example together with its
label (positive or negative). Our goal is to learn an unknown target
concept from labeled examples.

In this paper, we assume that the target concept is an intersection of
$\ell$ halfspaces in $n$-dimensional space, where the normal vectors to
the bounding hyperplanes of the positive region span a space of a
constant dimension $k$. 
Let $\Bn$ be the $n$-dimensional ball of radius 1, and let $\ballvol$
denote the volume of $\Bn$.  We assume that the examples
are drawn from the uniform distribution over  $B_n$.  Thus, we can define the 
positive region $P$ to be the intersection of the $\ell$ halfspaces and
$\Bn$.  The error of a hypothesis $H$ with respect to the target
concept $P$, for this distribution, is just $\vol((H
\Delta P) \cap \Bn)/ \vol(\Bn)$, where $\vol(R)$ denotes the volume of region $R$.  As
is standard in 
computational learning theory, our algorithm is given an error
parameter $\epsilon$ and a confidence parameter $\delta$ and the goal
of the algorithm is to produce with probability at least $1-\delta$ a
hypothesis with error at most $\epsilon$.  
%%%%We will tend to be casual in our dealings with $\delta$ since the
%%%%confidence parameter can be easily boosted in standard ways.  
We require the hypothesis to be
polynomial-time computable (given a point $x$ one can determine
whether $x$ is a positive or negative example of the hypothesis in
polynomial time) but do not require it to be represented as an
intersection of halfspaces.

%% Let $r \leq k$ be the dimension of the ``relevant space'' $\V$ and
%% let $v_{r+1}, \ldots, v_n$ be mutually orthogonal directions all
%% orthogonal to $\V$ (i.e., these are a basis for $\V^\perp$).

If $c$ is a scalar, then 
$c\Bn$ is the $n$-dimensional ball of radius $c$. 
It is not hard to see that $c\Bn$ has
volume $c^n\ballvol$.  
%%\footnote{Technically we can only examine $poly(n)$ bits of
%%precision of each example, but we can safely ignore this issue as it
%%produces only insignificant errors.} 
If $R$ is some region and $A$ is 
a (real or vector) function of points $x$, we
write $\E_{x \in R}[A]$ to denote the expectation of $A$ given that
$x$ is chosen uniformly from region $R$.  We will sometimes abbreviate
this to $\E_R[A]$.  Of course, we cannot calculate such quantities
exactly; we can only approximate through repeated observations.  If $S$ is a
sample of points, then $\E_S[A]$ is the average of $A$ over sample
$S$.

We treat examples both as points and as vectors, and use $|x|$ to
denote the {\em length} of $x$, which is its ($L_2$) distance to the
origin (its length as a vector).  A {\em direction} is a vector of
length 1.  A point $x$ is {\em $\tau$-central} to a region $R$ if a
ball of radius $\tau$ about $x$ is contained in $R$. The {\em mean} or
{\em center of gravity} of a region $R$ is $\E_{x \in R}[x]$.

Suppose $R = \{c \in Bn : \w_i \cdot x \leq a_i, \for
i=1,2,\ldots,l\}$ and none of the constraints $\w_i\cdot x\leq a_i$ is
redundant (i.e., dropping any constraint strictly enlarges $R$). Then,
the {\em relevant subspace of $R$}, denoted $\vrel(R)$, is the span of
$\w_1, \ldots, \w_l$. The {\em irrelevant subspace of $R$}, denoted
$\virrel(R)$, is the orthogonal complement of $\vrel(R)$. I.e.,
$\virrel(R)$ is the collection of all vectors orthogonal to
$\vrel(R)$. Vectors in $\vrel(R)$ are called {\em relevant} vectors
and vectors in $\virrel(R)$ are called {\em irrelevant} vectors.  More
generally, if $R$ is some arbitrary region in $\Bn$, we say that a
vector $\v$ is irrelevant to $R$ if for all pairs of points $x,x' \in
\Bn$ where $x' = x + c\v$ for some scalar $c$, either both $x$ and
$x'$ are in $R$ or both are not.  Then, $\virrel(R)$ is the span of
all irrelevant vectors $\v$ for $R$.
It is not too hard to see that this
definition implies that every vector in $\virrel(R)$ is irrelevant,
and that this definition is consistent with the previous definition of
$\virrel(R)$. Again, $\vrel(R)$ is the orthogonal complement of
$\virrel(R)$.
%%%I.e., $\vrel(R)$ is the collection of all vectors orthogonal to the
%%%space $\virrel(R)$.  
Notice that non-irrelevant vectors are not necessarily relevant.
For example, if $R = \{x : x_1 \geq 0, x_2 \geq 0\}$ then
any vector with zero first and second components is irrelevant, and
$\vrel(R)$ is the span of the coordinate vectors $\hat{x}_1$ and
$\hat{x}_2$.  The vector $(1,1,1,0,\ldots,0)$ is not irrelevant but
also not is in $\vrel(R)$. 

If $\W$ is a subspace of $\R^n$ and $x \in \R^n$, let $\proj(x,\W)$ be
the projection of $x$ onto $\W$.  Given a set $R$, we call
$\proj(x,\vrel(R))$ the ``relevant component'' of $x$ and
$\proj(x,\virrel(R))$ the ``irrelevant component.''  Also, we denote
the dimension of a space $\W$ by $\dim(\W)$.  So,
$\dim(\vrel(P)) = k$.
Finally, if $R$ is a convex region containing the origin, a ``cone''
in $R$ is the convex hull of the origin and some patch of the surface
of $R$.  An infinitesimally-small cone is the convex hull of the
origin and some infinitesimal patch $\partial R$ of the surface of $R$.

We will make frequent use of the following probabilistic inequalities
(Hoeffding and Chernoff bounds).  Let $X_1, \ldots, X_m$ be a sequence
of $m$ independent $\{0,1\}$ random variables with $\E[X_i] = p$, and
let $S$ be their sum.  Then, for $0 \leq \nu \leq 1$, the
following inequalities hold: 
\begin{itemize}
\item $\prob[S > (1 + \nu)pm] \leq e^{-\nu^2pm/3}$,  
\item $\prob[S < (1 - \nu)pm] \leq e^{-\nu^2pm/2}$, and 
\item $\prob[|\frac{S}{m} - p| > \nu] \leq 2e^{-2m\nu^2}$.
\end{itemize}
The last inequality holds even if the random variables $X_i$ are
identically distributed in $[0,1]$.

\section{Finding one nearly relevant vector}\label{sec:find one}

Let $\V = \vrel(P)$.  We may assume that
$\frac{\epsilon}{2}\ballvol \leq \vol(P) 
\leq (1 - \frac{\epsilon}{2})\ballvol$
since otherwise, by sampling we can notice that ``all positive'' or
``all negative'' are sufficiently close hypotheses. 
We show here how given $0 < \alpha < 1$, with probability at least $1-\delta$
we can find
a direction (unit vector) $v$ such that $|\proj(v,\virrel(P))|
< \alpha$, in time $poly(n,1/\alpha,1/\epsilon, \log(1/\delta))$.  
The algorithm itself is fairly simple, though the analysis takes
a bit of work.

The algorithm is this. We first select a large sample $S$ of positive
examples and find the mean $\mu_S$ of $S$.  If $|\mu_S| \geq 
\epsilon/(16\sqrt{n}(n+1))$
we return $\mu_S/|\mu_S|$ as our output.  Otherwise, we draw a large
sample $S'$ of positive examples and compute the direction $v$ that
minimizes $\sum_{x \in S'}(v \cdot x)^2$.  This computation can be
done by finding the eigenvector of least eigenvalue for a matrix
defined by the points in $S'$, as described in Section \ref{sec:find
min} below. We then return this vector $v$ as our output.  We prove
that if $S$ has size $O(\frac{n^4}{\epsilon^2\alpha^2} 
\log(n/\delta))$ and $S'$ has size
$O(\frac{n^7}{\epsilon^6\alpha^4} \log(n/\delta))$, then the unit vector
produced will have with high probability a sufficiently small
component in the irrelevant space as desired. 

The first (easier) part of the argument concerns the goodness of $\mu_S$.
Let $\upos$ denote the mean
of $P$. Since the ball $\Bn$ is a bounded region, Hoeffding bounds
imply that $\mu_S$ and $\upos$ will likely be close in each coordinate
direction, and therefore will be close overall, if $S$ is sufficiently
large.  In particular, we select $S$ of size
$O(\frac{n^4}{\epsilon^2\alpha^2} \log(n/\delta))$ 
so that with
probability $\geq 1-\delta/2$ the observed mean $\mu_S$ is within
$\epsilon\alpha/(16\sqrt{n}(n+1))$ of $\upos$. We assume in the
following that this holds.

As discussed in the introduction, $\upos$ lies in $\V$. So, if
$|\mu_S|$ is at least $\frac{\epsilon}{16\sqrt{n}(n+1)}$, the vector
$\mu_S/|\mu_S|$ is a unit vector with at most an $\alpha$ component in
the $\virrel(P)$ space as desired and we are done.  On the other hand,
suppose $|\mu_S| \leq \frac{\epsilon}{16\sqrt{n} (n+1)}$.  In this
case as described above, we draw a large sample $S'$ of positive
examples and compute the unit vector $v$ that minimizes $\sum_{x \in
S'}(v \cdot x)^2$.  We show that the vector found will have the
desired properties through a corollary to a main theorem we prove in
section
\ref{sec:why min} and a technical lemma.  We present and use the lemma
and corollary here leaving their proofs to Section \ref{sec:why min}
and the appendix.
\begin{lemma}\label{tau central}
If $R \subseteq \Bn$ is convex and has volume $r\ballvol$, then the
mean $\mu_R$ of $R$ is $(\frac{r}{2\sqrt{n}(n+1)})$-central to $R$.
\end{lemma}
{\bf Proof:} See appendix. \qed

\begin{corollary} \label{cor1} {\em [Corollary to Theorem \ref{moment
theorem} of Section \ref{sec:why min}] } 
Let $R \subseteq \Bn$ be a convex set with volume $r\ballvol$, and
suppose the origin is $\tau$-central in $R$. Let
$v$ be the unit vector such that $\E_R[ (v \cdot x)^2]$ is minimized,
and $w$ any other unit vector with $t = |\proj(w,\virrel(R))|$.
Then:
\begin{enumerate}
\item $v \in \vrel(R)$.

\item $\E_R[ (w \cdot x)^2] \; \geq \; \E_R[ (v \cdot x)^2] + \sfrac{t^2
n \Delta}{k(n+2)}.$
\end{enumerate}
where $k = \dim(\vrel(R))$; $\Delta = r(\frac{\tau}{8n^2})$ if $r \leq
\half$, and $\Delta = \frac{(1-r)^2}{r}(\frac{\tau}{8n^2})$ if $r > \half$.
\end{corollary}
{\bf Proof: } See Section \ref{sec:why min}. \qed

\smallskip
\noindent
Now, since $|\mu_S| \leq \frac{\epsilon}{16\sqrt{n}(n+1)}$, and $\mu_S$ is
within $\frac{\epsilon\alpha}{16\sqrt{n}(n+1)}$ of $\upos$, we have
that $|\upos| \leq \frac{\epsilon}{8\sqrt{n}(n+1)}$.  By Lemma
\ref{tau central} and our conditions on $\vol(P)$, this means:
the origin is $\tau$-central in $P$ for $\tau \geq
 \frac{(\epsilon/2)}{2\sqrt{n}(n+1)} -
\frac{\epsilon}{8\sqrt{n}(n+1)} = \frac{\epsilon}{8\sqrt{n}(n+1)}.$
So, Corollary \ref{cor1} implies that if $|\proj(w,
\virrel(R))| \geq \alpha$ then,
\begin{eqnarray*}
\E_P[(w \cdot x)^2] & \geq & \E_P[(v \cdot x)^2] + \\
& & \left[\sfrac{\alpha^2 n}{k(n+2)} \right] \left( (\sfrac{\epsilon}{2})^2
\sfrac{\epsilon}{8\sqrt{n}(n+1)} \right) \left(\sfrac{1}{8n^2}\right)\\
& = & \E_P[(v \cdot x)^2] + f, \\
& & \mbox{for } f =
\sfrac{\alpha^2\epsilon^3}{256kn^{3/2}(n+1)(n+2)}.
\end{eqnarray*}
We now choose $S'$ sufficiently large so that with probability
$1-\delta/2$, every unit vector $w$ has the observed $\E_{S'}[(w \cdot
x)^2]$ within $f/3$ of the true $\E_{P}[(w \cdot x)^2]$.  Note that
since $\E[(w \cdot x)^2] = \E[\sum_{i,j} w_i w_j x_i x_j] =
\sum_{i,j}\E[x_i x_j]$, it is enough to have each observed
$\E[x_ix_j]$ close to its true value, where $x_i$ and $x_j$ are the
components of $x$ in the $i$th and $j$th coordinates.  Therefore, by
Hoeffding bounds, this will hold if $S'$ has size
$O((n^7/\epsilon^6\alpha^4) \log(n^2/\delta))$.

So, if $v$ is the vector that minimizes $\E_P[(v \cdot x)^2]$ and $w$
is a vector with  $|\proj(w, \virrel(R))|$ $\geq$ $\alpha$, then
\begin{eqnarray*}
\E_{S'}[(w \cdot x)^2] & \geq &  \E_{P}[(w \cdot x)^2] - f/3 \\
& \geq & \E_{P}[(v \cdot x)^2] + 2f/3 \\
& \geq & \E_{S'}[(v \cdot x)^2] + f/3.
\end{eqnarray*}
Thus, no such direction will be found by minimizing the observed
second moment, which is what we wanted.
So, we have:
\begin{theorem}
There is an algorithm that given
$0 < \delta,\epsilon,\alpha < 1$, in time $poly(n,\inv{\alpha},
\inv{\epsilon}, \log(1/\delta))$ finds a 
direction $v$ such that if $\frac{\epsilon}{2}\ballvol \leq \vol(P) \leq
(\frac{1-\epsilon}{2}) \ballvol$, then with probability $\geq
1-\delta$,  $|\proj(v,\virrel(P))| < \alpha$. 
\end{theorem}

\subsection{How to Minimize second moment}\label{sec:find min}

Given a finite set of points $S$, we want to find a non-zero vector $\v$ that
minimizes $$ \sum_{\vx \in S}\frac{(\v \cdot
\vx)^2}{|\v|^2}.$$
%%
%%\noindent
A standard method for doing this is as follows \cite{Morrison90}.
Let $\bA$ be a matrix with the points in $S$ as row vectors, and let
$\bB$ = $\bA^{T}\bA$.  So, equivalently, we want the column vector
$\bv$ such that $\bv^{T}\bB\bv/(\bv^T\bv)$ is minimized.  This is
just the eigenvector of $\bB$ having the least eigenvalue, for the
following reason.

By definition, $\bB$ is a symmetric matrix, and so it is a
standard linear-algebra fact that $\bB$ has an orthonormal basis of
eigenvectors. Also, by definition of $\bB$, all the eigenvalues of
$\bB$ are real and non-negative. Say $\bv_1, \ldots, \bv_n$ form
an orthonormal basis of eigenvectors for $\bB$, and that
$\bB \bv_i = \lambda_i\bv_i$.  So, for any column vector $\bv$, we can
write $\bv = \sum_i c_i\bv_i$ for some scalars $c_i$.  Thus,
$$\frac{\bv^{T}\bB\bv}{\bv^T\bv} \; = \; \frac{(\sum_i c_i
\bv_i^T)(\sum_i c_i \lambda_i \bv_i)}{\sum_i c_i^2} \; = \;
\frac{\sum_i \lambda_i c_i^2}{\sum_i c_i^2}.$$
This is clearly minimized when $c_i = 1$ for $\lambda_i$ having the
least eigenvalue, and $c_{i'} = 0$ for $i' \neq i$.

Thus, all we must do is find the eigenvector of least eigenvalue for
an $n$ by $n$ matrix, and this can be done to as many bits of
precision as desired in polynomial time by standard techniques.


\subsection{Why the minimum second moment
direction is useful}\label{sec:why min}
In this section we show why the direction of minimum second moment
lies in the relevant space.  The high level idea is that the planes
``constrict'' the positive region in relevant directions but not in
irrelevant ones.  The theorems we show, at least in rough form, are
fairly intuitive for low dimensional spaces, but seem to require quite
a bit of work to prove in $n$ dimensions.

In the following theorems and lemmas, $R \subseteq \Bn$ is a convex region
which one can think of as the positive region $P$.  Given an example
$x$, we define $\rel(x) = \proj(x,\vrel(R))$ and $\ir(x) =
\proj(x,\virrel(R))$. 

\begin{theorem}\label{moment theorem}
Let $R \subseteq \Bn$ be a convex set and suppose that the
origin is $\tau$-central in $R$. Let
$v$ be the unit vector in $\vrel$ such that $\E_R[ (v \cdot x)^2]$ is
minimized. Then, for any unit vector $w \in \virrel(R)$, we have:
\begin{eqnarray*}
\E_{R}\left[ (w \cdot x)^2 \right] &  \geq &  \E_R\left[(v \cdot x)^2\right] + \frac{n\Delta}{k(n+2)}
\end{eqnarray*}
where $\Delta$ is as defined in Corollary \ref{cor1} and $k =
\dim(\vrel(R))$.
\end{theorem}
We prove this theorem through a sequence of lemmas, the main one being
Lemma \ref{vol-lemma}.  We first prove Corollary \ref{cor1}
given the theorem, then state and prove the lemmas, and then finally
prove the theorem.

\noindent
{\bf Proof of Corollary \ref{cor1}. }  Let $v$ be the direction in
$\vrel(R)$ such that $\E_R[ (v \cdot x)^2]$ is minimized (over
directions in $\vrel(R)$).  We need only prove claim (2) for this $v$ as
claim (1) follows.  For direction $w$, let $w = w' + w''$ where $w'
\in \vrel(R)$ and $w'' \in \virrel(R)$.  So, $\E_R[(w \cdot x)^2] =
\E_R[(w' \cdot x + w''\cdot x)^2]  =
\E_R[(w' \cdot x)^2] + \E_R[(w'' \cdot x)^2] + 2\E_R[(w' \cdot x)(w''
\cdot x)]$.  The crossterm above equals zero because by definition of
$\virrel(R)$, for each value of $c$ we have $\E_{x \in R, w' \cdot x =
c}[w'' \cdot x] = 0$.  So, by Theorem \ref{moment 
theorem} (and recalling that $|w''|^2 = t^2$ and $|w'|^2 = 1 -
t^2$) we have:
\begin{eqnarray*}
\E_R[(w \cdot x)^2] &  \geq & (1 - t^2) \E_R[ (v \cdot x)^2] + \\
& & t^2\left[\E_R[(v \cdot x)^2] + \sfrac{n\Delta}{k(n+2)}\right] \\
& \geq & \E_R[(v \cdot x)^2] + \sfrac{t^2 n \Delta}{k(n+2)}.
\mathqed
\end{eqnarray*}


\begin{lemma}\label{ball-lemma}
$\ebx = \frac{n}{n+2}.$  Note, by symmetry, this implies
that for any $v$ of length 1, $\E_{x \in \Bn}[(v \cdot x)^2] =
\inv{n+2}$. 
\end{lemma}
{\bf Proof. }  Straightforward calculation. 
$\displaystyle \ebx \;=\; \inv{\ballvol} \int_{r=0}^{r=1} r^2
(\ballvol n r^{n-1}) dr \;=\; \frac{n}{n+2}. \mathqed$

\begin{lemma}\label{at-least-lemma}
If $R$ is a convex region in $\Bn$ that contains the origin, then
\begin{eqnarray*}
\E_{x \in R}[ |\ir(x)|^2 ] & \geq & \E_{x \in \Bn}[ |\ir(x)|^2 ].
\end{eqnarray*}
\end{lemma}


\noindent
{\bf Proof. } See appendix.



The next (and main) lemma shows that as
long as there is a reasonable fraction of both positive and negative
examples and the positive region contains a small ball about the
origin, the average squared length of a positive example  is
less by some noticeable amount than the average squared length of a
point in $\Bn$ (and therefore is less by some noticeable amount that the
average squared length of a negative example). 

\begin{lemma}\label{vol-lemma}
Let $R \subseteq \Bn$ be a convex set and suppose that the
origin is $\tau$-central in $R$.  Let $r = \vol(R)/\vol(\Bn)$.  Then,
\begin{eqnarray*}
\E_R\left[\;|x|^2\;\right] & \leq & \E_{\Bn}\left[\;|x|^2\;\right](1 - \Delta)
\end{eqnarray*}
where $\Delta = r(\frac{\tau}{8n^2})$ if $r \leq \half$, and $\Delta =
\frac{(1-r)^2}{r}(\frac{\tau}{8n^2})$ if $r > \half$.
\end{lemma}

\noindent
{\bf Proof} (of Lemma \ref{vol-lemma}). \ \  The basic idea of the proof is
as follows.  Imagine
breaking up $\Bn$ into a union of infinitesimally small cones.
Each  cone has the same value of $\E[ |x|^2]$. For a cone $dC$, say that the
{\em length} of the intersection $dC \cap R$ is the maximum $|x|$ over
$x \in dC \cap R$.  What we show is that a noticeable volume of $R$ is
contained in  cones $dC$ such that $dC \cap R$ has length at most
$1-\alpha$ for some $\alpha$ noticeably greater than zero.  This in
turn gives us the result we want.  Now, to the specifics.

Let $\alpha$ be some small quantity greater than 0 to be determined
later.  We begin by showing that a substantial part of the surface of
$R$ lies inside $(1-\alpha)\Bn$.  To do so, we first provide 
bounds on the volume of $R \cap (1-\alpha)\Bn$.  We know $\vol(R) = r\ballvol$.
Also note that $\vol(\Bn - (1-\alpha)\Bn) = (1 - (1 - \alpha)^n)\ballvol$.
So,
\begin{eqnarray}
\vol(R \cap (1-\alpha)\Bn)& \geq & r\ballvol - (1 - (1 -
\alpha)^n)\ballvol \nonumber\\
& = & \ballvol(r + (1 - \alpha)^n - 1) \nonumber \\
& \geq & \ballvol(r - \alpha n).\label{lowervol} 
\end{eqnarray}
Also, 
\begin{eqnarray}
\vol((1-\alpha)\Bn - R) & \geq & (1 - \alpha)^n \ballvol - r\ballvol
\nonumber\\
& \geq & \ballvol(1 - \alpha n - r). \label{uppervol}
\end{eqnarray}
We now use the following isoperimetric inequality due to Lov\'{a}sz and
Simonovits \cite{LovaszSi90}.

\begin{fact}[Theorem 2.1 of \cite{LovaszSi90}]
Let $T$ be a convex set in $\R^n$ partitioned into two sets $S$ and $T
- S$ by an $(n-1)$-dimensional surface with surface area
($(n-1)$-dimensional volume) $a$.  Then,
\begin{eqnarray*}
a & \geq & \inv{diam(T)}\min[\vol(S),\; \vol(T - S)]
\end{eqnarray*}
where $diam(T)$ is the maximum distance between two points in $T$.
\end{fact}
We apply this inequality with $S = R \cap (1-\alpha)\Bn$, $T =
(1-\alpha)\Bn$, and the separating surface being $\partial R \cap T$
(the surface of $R$ inside $T$), to get (using equations (\ref{lowervol})
and (\ref{uppervol})): 
\begin{eqnarray}
\lefteqn{\vol_{n-1}(\partial R \cap (1-\alpha)\Bn) \; \geq }
\nonumber\\
& & \shalf
\ballvol \min \left[ \; {r - \alpha n}, \; {1 - \alpha n - r}\;
\right]. \label{volineq} 
\end{eqnarray}
We also need the following.
\begin{claim}\label{claim1}
The integrated volume over all infinitesimal cones
of $R$ that are contained in $(1-\alpha)\Bn$ is at least
$\frac{\tau}{n} \vol_{n-1}(\partial R \cap (1-\alpha)Bn)$.
\end{claim}
That is, the volume of the set of  points $x \in R$ such that the ray
from the origin through $x$ hits the surface of $R$ while still inside
$(1-\alpha)\Bn$ is at least the above quantity.

\noindent
{\bf Proof of claim. } See appendix.

\medskip
\noindent
To finish the proof of the lemma, we consider two cases.

\noindent
{\bf Case 1: } $r \leq 1/2$, so the ``min'' in inequality
(\ref{volineq}) is ${r - \alpha n}$.

For this case, we choose $\alpha = \frac{r}{2n}$.  So,
$\vol_{n-1}(\partial R \cap (1-\alpha)\Bn) \geq \half \ballvol
(r/2) = r\ballvol / 4$. Applying Claim \ref{claim1} we 
have that the volume of cones of $R$ ending within $(1-\alpha)\Bn$ is
at least $(\frac{\tau}{4n})r\ballvol$.  In other words, at least a
$\frac{\tau}{4n}$ fraction of $R$ is inside such cones.

Now, each cone $C$ that ends within $(1 - \alpha)\Bn$ has length at most
$(1 - \alpha)$ which means that $\E_{C}[\; |x|^2\;] \leq (1 -
\alpha)^2 \ebx$. Each cone $C$ that doesn't end within
$(1 - \alpha)\Bn$ has $\E_{C}[\; |x|^2\;] \leq \ebx$.
So, integrating over all cones, we have:
\begin{eqnarray*}
\erx & \leq & \frac{\tau}{4n} (1 - \alpha)^2 \ebx  + \left(1 -
\sfrac{\tau}{4n}\right) \ebx \\
& \leq & \ebx \left( 1 - \sfrac{\tau\alpha}{2n} + \sfrac{\tau
\alpha^2}{4n}\right) \\
& \leq & \ebx \left( 1 - \sfrac{\tau\alpha}{4n} \right) \\
& \leq & \ebx \left( 1 - \sfrac{\tau r}{8n^2} \right) \Box
\end{eqnarray*}

\noindent
{\bf Case 2: } $r \geq 1/2$.  Here we set $\alpha = \frac{1 - r}{2n}$.
The proof is similar to Case 1 and proceeds as follows.

Using equation (\ref{volineq}) we have $\vol_{n-1}(\partial R \cap
(1-\alpha)\Bn) \geq (1-r)\ballvol / 4$.  Applying Claim \ref{claim1}
we get that at least a fraction $f = \frac{(1-r)}{r}
(\frac{\tau}{4n})$ of $R$ is inside cones of $R$ ending within
$(1-\alpha)\Bn$.  So, we get:
\begin{eqnarray*}
\erx & \leq &  f(1-\alpha)^2 \ebx + (1-f)\ebx  \\
& \leq & \ebx \left( 1 - 2f\alpha + f\alpha^2\right) \\
& \leq & \ebx \left( 1 - f\alpha \right) \\
& \leq & \ebx \left( 1 - \sfrac{(1-r)^2\tau}{8n^2r} \right) \Box
\end{eqnarray*}
\noindent
So we have proved Lemma \ref{vol-lemma}. \qed


\noindent
{\bf Proof of Theorem \ref{moment theorem}. } First, lemmas
\ref{ball-lemma} and \ref{at-least-lemma} imply that $\E[(w
\cdot x)^2] \geq \frac{1}{n+2}$ for the following reason. By
symmetry (all directions $w \in \virrel(R)$ have the same
value of $\E_R[ (w \cdot x)^2]$) and the fact that $\virrel(R)$
is an $(n-k)$-dimensional space, we have
$\E_R[ (w \cdot x)^2]   =  \sfrac{1}{n-k} \E_R[|\ir(x)|^2]$.  This
quantity by Lemma \ref{at-least-lemma} is at least $\sfrac{1}{n-k}
\E_{\Bn}[ |\ir(x)|^2]$, which (by symmetry and lemma
\ref{ball-lemma}) equals $\inv{n+2}.$

Now, using $|x|^2 = |\rel(x)|^2 + |\ir(x)|^2$, we can apply lemmas
\ref{ball-lemma} and \ref{vol-lemma} to get
$\E_R[|\rel(x)|^2] + \E_R[|\ir(x)|^2]  \leq  \E_{\Bn}[|\rel(x)|^2] +
\E_{\Bn}[|\ir(x)|^2] - \Delta\sfrac{n}{n+2}$. This implies
\begin{eqnarray*}
\E_R[|\rel(x)|^2]  & \leq &  \E_{\Bn}[|\rel(x)|^2] -
\sfrac{n\Delta}{n+2} \mbox{\ \ \ \ \ (Lemma \ref{at-least-lemma})}\\
& = & \sfrac{k}{n+2} -  \sfrac{n\Delta}{n+2}.\\
& &   \mbox{(by observation in Lemma \ref{ball-lemma})}
\end{eqnarray*}
Now, by definition of $v$ (in particular the fact that 
$\E_R[(v \cdot x)^2] \leq \inv{k}\E_R[|\rel(x)|^2]$), we have 
$\E_R[(v \cdot x)^2] \leq \inv{n+2} - \frac{n\Delta}{k(n+2)}$.
The theorem follows. \qed
\section{The algorithm}

This section describes the learning algorithm. 
We assume the target concept $P$ is an intersection of $\ell$ halfspaces
in $n$ dimensions, such that $\vrel(P)$ has dimension $k$.  We want to
learn with error $\epsilon$ and failure probability at most $\delta$. 

With section \ref{sec:find one} on hand, the recursive algorithm is simple: 
We first use the procedure of section \ref{sec:find one} to find
a unit length vector $u$ whose irrelevant component is at most
$\epsilon_1$ (to be specified later), with probability $\geq 1 - \delta/4$.
The algorithm will then consider slices perpendicular to $u$. Each
slice is of the form $$\{x\in B_n : m\epsilon_1 \leq u\cdot x
\leq (m+1)\epsilon_1\}$$
where $m$ is an integer and $\epsilon_1$ is the thickness of each
slice. So, there are $2/\epsilon_1$ slices total.  Call a slice
``big'' if its volume is at least
$\inv{12}\epsilon_1\epsilon\gamma_n $.
We will recursively PAC-learn each big
slice to error at most $\epsilon /6 $ and probability of 
failure at most ${\delta\epsilon\over 48n}$. (We later show that this
is a ``$k-1$'' problem.)   We just classify
each small slice as negative. The final hypothesis produced will in
essence be a depth-$k$ ``linear-threshold decision tree'', where a
decision feeds an example into the hypothesis for the appropriate
slice.  Before giving a more detailed description
of the algorithm, we prove 
some technical facts needed for the analysis
of the algorithm. 

Suppose the (unknown) positive region is 
$$P=\{ x\in B_n : a_i \cdot x\leq a_{io}\for i=1,2,\ldots, \ell\}$$
where the $a_i$'s are unit length vectors that 
span the $k$ dimensional relevant space $\V = \vrel(P)$. 
Let $u_1$ be the relevant component of $u$. Each $a_i=\lambda_iu_1 + b_i$
for some real number $\lambda_i$ and vector $b_i\in \V$ orthogonal to
$u_1$. Let $c_i=\lambda_i u + b_i$ and let 
$$P_1=\{ x\in B_n : c_i \cdot x\leq a_{io}\for i=1,2,\ldots, \ell\}.$$

For each slice $\{x\in B_n : m\epsilon_1 \leq u\cdot x \leq (m+1)\epsilon_1\}$
(which will henceforth be denoted \slice$(m)$) with $m\geq 0$,
let $P_1(m)$ be the intersection of $P_1$ with the hyperplane 
$\{x : u\cdot x = (m+1)\epsilon_1\}$.  For $m\leq -1$, let $P_1(m)$
be the intersection of $P_1$ with $\{x : u\cdot x = m\epsilon_1\}$. Note that
$P_1(m)$ is an $n-1$ dimensional convex set with a ``relevant space'' of
dimension $k-1$, since $b_1,b_2,\ldots, b_l$ span the $k-1$ dimensional
space of vectors in $\V$ orthogonal to $u_1$. We want to assume that all
examples that land in \slice$(m)$ are labeled according to $P_1(m)$
so we can recurse on a relevant space of one less dimension, but
this will introduce some errors which need to be analyzed; we first prove some
facts which will be useful for this analysis.

For $x\in B_n$, define $f(x)$ to be the projection of $x$ onto the
smaller bounding plane of its slice.  That is, 
if $x=\lambda u + y$, with $y$ perpendicular to $u$, then
$f(x) = \lceil {\lambda\over\epsilon_1}\rceil \epsilon_1 u +y$ if
$\lambda$ is nonnegative,  and $f(x)=\lfloor {\lambda\over\epsilon_1}
\rfloor \epsilon_1 u +y$ if $\lambda$ is negative.  We call an $x\in B_n$
``good'' (or say $x\in $GOOD) if it satisfies {\em all} of the following:

(i) $x$ belongs to both $P$ and $P_1$ or $x$ belongs to neither.

(ii) $f(x)$ belongs to both $P$ and $P_1$ or it belongs to neither.

(iii) $f(x)$ belongs to $B_n$.

(iv) Either both $x$ and $f(x)$ belong to $P$ or neither does.

\noindent
In particular, for an example $x$ from GOOD $\cap $ \slice$(m)$,
its label according to $P$ is the same as the label of $f(x)$
according to $P_1(m)$.

The algorithm will take each labeled example $x$ in a big slice \slice$(m)$,
attach the same label to $f(x)$ (if $f(x)$ is in $B_n$) and treat
these as labeled examples for $P_1(m)$.  Say that $x \in  \Bn$ is
``bad'' (or $x \in$BAD) if $x \not\in$GOOD.  We then have the following.

\begin{lemma}
The volume of the bad set  is at most $12 \ell \gamma_{n-1} \epsilon_1
+\gamma_n n\epsilon_1\leq 13 \ell n\gamma_n\epsilon_1$.
\end{lemma}

{\bf Proof: }
We first observe that $|a_i-c_i|\leq 
2\epsilon_1$ since we may assume $|\lambda_i| < 2$.  So for
$x\in B_n$, we have $a_ix-2\epsilon_1\leq c_ix 
\leq a_ix+2\epsilon_1$ and so the set of $x$ that violates (i) is contained
in $\cup_{i=1}^l \{x\in B_n : a_{io}-2\epsilon_1\leq a_ix\leq a_{io}
+2\epsilon_1\}$; this has volume at most $4\ell\epsilon_1 \gamma_{n-1}$,
since $|a_i|=1$ for all $i$.  If $x$ violates (ii), then $f(x)$
belongs to $\cup_{i=1}^\ell \{x\in B_n : a_{io}-2\epsilon_1\leq a_ix\leq a_{io}
+2\epsilon_1\}$ by the above. But $|x-f(x)|\leq \epsilon_1$, so we have
that $x$ belongs to the set
$\cup_{i=1}^\ell \{x\in B_n : a_{io}-2\epsilon_1-\epsilon_1\leq a_ix\leq a_{io}
+2\epsilon_1+\epsilon_1\}$ whose volume is at most
$6\ell \epsilon_1\gamma_{n-1}$. 
If $f(x)$ does not belong to $B_n$,
then $x$ belongs to $\{x : 1-\epsilon_1\leq |x|\leq 1\}$, a set of 
volume $\leq \gamma_n n\epsilon_1$.  If $x$ violates (iv),
then note that $x$ belongs to the
set $\cup_{i=1}^\ell \{x\in B_n : a_{io}-\epsilon_1\leq a_ix\leq a_{io}
+\epsilon_1\}$ whose volume is at most $2\ell\epsilon_1 \gamma_{n-1}$.
\qed 

We will define a slice to be ``bad'' if the volume of the bad set
intersected with the slice is at least $$\epsilon_3\quad\times \hbox{
(the volume of the slice)}$$ where $\epsilon_3$ will be specified
later.  Otherwise, the slice is ``good.''

We use  $E(k,n,\epsilon ,\delta )$ to denote the
number of examples the algorithm will need, where $k$, $n$,
$\epsilon$, and $\delta$ stand for the usual quantities.  Let us
assume by induction that we have already computed 
$E(k-1,n-1,\epsilon /6 ,
{\delta\epsilon\over 48n})$, and abbreviate this by $N(k-1)$.
We are now ready to describe the algorithm.

\begin{center}{\bf THE ALGORITHM}\end{center}

\begin{enumerate}
\item Find with probability at least $1 - \delta/4$, a vector $u$ with
irrelevant component at most $\epsilon_1$ as described
above.\footnote{As described in 
section \ref{sec:find one}, we first sample to check whether ``all
positive'' or ``all negative'' are sufficiently close hypotheses,
before running the procedure. $\delta/4$ is the combined failure probability.}
\item Declare all small slices to be all negative.

\item Pick $\displaystyle \frac{24}{\epsilon_1\epsilon}
\left[N(k-1)+4\log(\sfrac{8}{\delta\epsilon_1})\right]$
labeled examples from $B_n$.

\item If any big slice has less than $N(k-1)$ examples in it, halt and
declare failure.

\item Otherwise, take the first
$N(k-1)$ examples that land in each big slice, attach the same label to
$f(x)$ (as $x$) and recursively learn the $n-1$ dimensional problem with
$k-1$ relevant directions with error at most $\epsilon / 6$ and 
failure probability at most $\delta\epsilon / (48n)$.

The overall concept learned is: if $x$ lies in a big slice,
then it is positive iff $f(x)$ is positive according to the recursively
learned concept.
\end{enumerate}

\begin{lemma}
In the above algorithm, the probability we halt in step (4) is at most
$\delta/4$.
\end{lemma}

\noindent
{\bf Proof: } In a given big slice, the expected number of examples
we get is at least:  $2[N(k-1) + 4\log(\frac{8}{\epsilon_1\delta})]$.  The
probability we get less that half that many is at most (by Chernoff)
$e^{-2[N(k-1) + 4\log(8/\epsilon_1\delta)]/8} \leq
\epsilon_1\delta/8$.  So, the chance there is {\em any} such slice is
at most $\delta/4$. \qed

We are now ready to specify $\epsilon_3$. We choose: \ 
$\displaystyle \epsilon_3 \; = \; \left({\epsilon \over 24n}\right)
{1\over N(k-1) }.$

We call a good slice ``corrupt'' if at least one (among the first 
$N(k-1)$) example in the slice lands in 
the bad region. [Note that we do not know
which slices are corrupt. So corruptness is used only in the analysis, not 
by the algorithm.] 

The expected number of examples that land in the bad region of  
a good slice is at most $\epsilon_3$ times the number of examples in the
slice, so is at most ${\epsilon \over 24n}$; thus by Markov inequality, the 
probability that some example lands in the bad region is at most 
${\epsilon \over 24n}$. Now noting that different good slices
being corrupted are independent events, we apply Chernoff bounds to get that 
the probability there are more than $(\frac{\epsilon}{6n\epsilon_1})$
corrupt slices is at most $e^{-\epsilon/(36 n\epsilon_1)}$ which is
at most $\delta/4$ for sufficiently small $\epsilon_1$.

Since the volume of any slice is at most $\gamma_{n-1}\epsilon_1\leq n\gamma_n
\epsilon_1$, we have:
\begin{lemma}\label{prlemma1}
The probability that the total volume of corrupt slices
exceeds $\gamma_n\epsilon/6$ is at most $\delta /4$. 
\end{lemma}

In the uncorrupted good slices, there is still a probability
$\frac{\delta\epsilon}{48n}$ 
that the recursively-called algorithm will fail.  So, the expected
number of such slices (which we will call ``failed slices'') is at
most $\frac{\delta\epsilon}{24\epsilon_1n}$. By Markov's inequality,
the probability there are more than $\frac{\epsilon}{6\epsilon_1n}$
failed slices is at most $\delta/4$.  Again, since the volume of a
slice is at most $n\gamma_n \epsilon_1$ we have:
\begin{lemma}\label{prlemma2}
The probability that the total volume of failed slices exceeds
$\gamma_n\epsilon /6$ is at most $\delta /4$. 
\end{lemma}
We have not specified the choice of $\epsilon_1$ yet; now we do so. It is 
chosen so that $13 \ell n \epsilon_1 /\epsilon_3\leq \epsilon / 6$.

\medskip

\noindent
{\bf Confidence analysis: } We want that the first vector $u$ has
irrelevant component at most $\epsilon_1$, that all big slices get
sufficient examples, and that lemmas \ref{prlemma1} and \ref{prlemma2}
above hold.  Each fails with probability at most $\delta/4$ so our
total failure probability is at most $\delta$.

\medskip

\noindent
{\bf Error analysis:} In the worst case, our hypothesis
is completely erroneous in the small, bad, corrupt, and failed slices.
In addition, there are errors in the remaining slices from the
recursive calls.  The total error volume is therefore at most as follows:

\begin{itemize}

\item Small Slices : Total Volume $\leq \epsilon\gamma_n /6$.
\item Bad Slices : Total volume $\leq 13 \ell n \gamma_n \epsilon_1
/\epsilon_3 \; \leq \epsilon\gamma_n/6$.
\item Corrupt Slices : Total Volume $\leq \epsilon\gamma_n / 6$.
\item Failed Slices : Total volume $\leq \epsilon\gamma_n / 6$.
\item Remaining slices : Error $\leq \epsilon /6$ of volume.
So total error volume is at most $\epsilon \gamma_n / 6$.
\end{itemize}
Thus, the total error  of the hypothesis produced is at most
$5\epsilon/6 < \epsilon$.

\medskip

\noindent
{\bf Number of examples needed: }
To find the first vector $u$ to error less than $\epsilon_1$ requires
$O(\epsilon_1^{-4} \cdot poly(n,\ell, \inv{\epsilon},\log\inv{\delta}))$
examples. This quantity is $O(N(k-1)^4 \cdot
poly(n,\ell, \inv{\epsilon},\log\inv{\delta}))$.  The additional number of
examples needed in step (2) of the algorithm is $O(N(k-1)^2 \cdot
poly(n,\ell, \inv{\epsilon},\log\inv{\delta}))$. Thus, it is clear that
since $k$ is a constant, this will be polynomial in the desired parameters,
although it is doubly-exponential in $k$.



\begin{thebibliography}{1}

\bibitem{Baum90a}
E.~B. Baum.
\newblock Polynomial time algorithms for learning neural nets.
\newblock In {\em Proceedings of the Third Annual Workshop on Computational
  Learning Theory}, pages 258--272. Morgan Kaufmann, 1990.

\bibitem{Baum90}
Eric~B. Baum.
\newblock On learning a union of half spaces.
\newblock {\em Journal of Complexity}, 6(1):67--101, March 1990.

\bibitem{Berger87}
M.~Berger.
\newblock {\em Geometry}.
\newblock Springer-Verlag, 1987.

\bibitem{BlumRi92}
A.~Blum and R.~Rivest.
\newblock Training a 3-node neural network is {NP-Complete}.
\newblock {\em Neural Networks}, 5:117--127, 1992.

\bibitem{BlumerEhHaWa89}
Anselm Blumer, Andrzej Ehrenfeucht, David Haussler, and Manfred~K. Warmuth.
\newblock Learnability and the {V}apnik-{C}hervonenkis dimension.
\newblock {\em Journal of the ACM}, 36(4):929--965, 1989.

\bibitem{LongWa90}
P.M. Long and M.K. Warmuth.
\newblock Composite geometric concepts and polynomial predictability.
\newblock In {\em Proceedings of the Third Annual Workshop on Computational
  Learning Theory}, pages 273--287. Morgan Kaufmann, 1990.

\bibitem{LovaszSi90}
L.~Lov\'{a}sz and M.~Simonovits.
\newblock The mixing rate of markov chains, an isoperimetric inequality, and
  computing the volume.
\newblock In {\em Proceedings of the 31st Annual Symposium on Foundations of
  Computer Science}, volume~I, pages 346--354, St. Louis, October 1990.

\bibitem{Megiddo86}
N.~Megiddo.
\newblock On the complexity of polyhedral separability.
\newblock Technical Report RJ 5252, IBM Almaden Research Center, August 1986.

\bibitem{Morrison90}
D.~F. Morrison.
\newblock {\em Multivariate Statistical Methods}, 3rd Ed.
\newblock McGraw-Hill, 1990.
\end{thebibliography}

\appendix

\section*{Appendix}
{\bf Proof of Lemma \ref{tau central}.}
Let $y$ be the closest point to $\mu_R$ on the boundary of
$R$. Let $\tau = |y - \mu|$.  Since $R$ is convex and $y$ is the
closest point, the there are
no points in $R$ on the opposite side of the hyperplane $H$ through $y$
perpendicular to the vector $y-\mu$.  It is a fact that in
general if a point $z$
is not in a convex region $R\subseteq {\bf R}^n$, 
then neither is the point $\mu_R +
n(\mu_R - z)$ \cite{Berger87}.  Thus, all points in $R$ lie between the
hyperplane$H$  and the
hyperplane parallel to $H$ through $\mu_R + n(\mu - y)$.
These hyperplanes are separated by distance $(n+1)\tau$, so the volume
of $R$ is at most $(n+1)\tau \gamma_{n-1} \leq (n+1)\tau
(2\ballvol\sqrt{n})$.  Setting this to $r \ballvol$ yields the desired
inequality for $\tau$.
\qed

{\bf Proof of Claim \ref{claim1}. }
Let $\rho$ be an infinitesimal patch of $\partial R$ inside $(1 -
\alpha)\Bn$, with surface area $dS$.  Let $x$ be a point on $\rho$ and
let $\theta$ be the angle between the tangent hyperplane $H$ to $\partial
R$ and the hyperplane normal to $x$ (treating $x$ as a vector).  Then, $dS\cos
\theta$ is the surface area of $\rho$ projected onto the hyperplane
normal to $x$.
So, the volume of the cone from the origin to $\rho$ is $(dS \cos
\theta) \frac{|x|}{n}$. 

We were given (in the statement of the lemma) that $R$ contains a ball
of radius $\tau$ about the origin.  Also, $R$ is convex which implies
that all points on the far side of hyperplane $H$ from the origin are
not in $R$.  So, $|x|\cos \theta$, which is the closest distance of
$H$ to the origin, is at least $\tau$.  Now combining this with the
result of the last paragraph we get that the volume of the cone from
the origin to $\rho$ is at least $dS\frac{\tau}{n}$.  Integrating over
all infinitesimal patches $\rho$ yields the claim.  \qed


{\bf Proof of Lemma \ref{at-least-lemma}. }
Let $\V = \vrel(R)$.  Recall $\ir(x) = \proj(x,\virrel(R))$ and
$\rel(x) = \proj(x,\V)$.

For a point $y \in \Bn \cap \V$, let $\rel^{-1}(y)$ be the collection
of all points $x \in \Bn$ such that $\rel(x) = y$.  That is,
$\rel^{-1}(y)$ is a ``fiber'' of all points with $y$ as their
component in space $\V$.  Notice that for any $y$, $\E_{x \in
\rel^{-1}(y)}[ |ir(x)|^2 ]$ is just a function of the distance of $y$
from the origin, and moreover the expectation decreases with
increasing $|y|$.  In fact, when $|y| = 1$, the quantity is zero.

Now, let $dC$ be a cone in $\Bn \cap \V$ of infinitesimal solid angle
$d\theta$ and let $d\tC$ be the set $\rel^{-1}(dC)$ (extending
$\rel^{-1}$ in the obvious way to sets).  By symmetry, $\E_{d\tC}[
|\ir(x)|^2] = \E_{\Bn}[ |\ir(x)|^2]$; this is because cone $dC$ has
the same proportion of points at each distance from the origin as does
the entire set $\Bn \cap \V$.

Because $R$ is convex and contains the origin, $dC \cap R$ is simply
cone $dC$ with all points greater than some distance from the origin
removed.  So, by our observation above, $\E_{\rel^{-1}(dC \cap
R)}[  |\ir(x)|^2 ]  \geq \E_{\rel^{-1}(dC)}[ |\ir(x)|^2]$.  By
definition of $\V$, we know $\rel^{-1}(dC \cap R) = d\tC \cap R$, so
$\E_{(d\tC \cap R)}[  |\ir(x)|^2 ]  \geq \E_{d\tC}[
|\ir(x)|^2]$.
Since cone $dC$ was arbitrary, this holds for all such infinitesimal
cones and we can integrate the expectation over them to get $\E_{R}[
|\ir(x)|^2 ]  \geq \E_{\Bn}[ |\ir(x)|^2]$. 
\qed


\end{document}


