@incollection{goodfellow14,
abstract = {We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to 1/2 everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.},
archiveprefix = {arXiv},
author = {Goodfellow, Ian J. and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
booktitle = {{Advances in Neural Information Processing Systems 27}},
comment = {published = 2014-06-10T18:58:17Z, updated = 2014-06-10T18:58:17Z},
eprint = {1406.2661v1},
localfile = {bibliography_resources/goodfellow - Generative Adversarial Networks.pdf},
month = jun,
pages = {2672--2680},
primaryclass = {stat.ML},
title = {{Generative Adversarial Networks}},
x-fetchedfrom = {arXiv.org},
year = {2014}
}
@inproceedings{drover18,
abstract = {3D pose estimation from a single image is a challenging task in computer vision. We present a weakly supervised approach to estimate 3D pose points, given only 2D pose landmarks. Our method does not require correspondences between 2D and 3D points to build explicit 3D priors. We utilize an adversarial framework to impose a prior on the 3D structure, learned solely from their random 2D projections. Given a set of 2D pose landmarks, the generator network hypothesizes their depths to obtain a 3D skeleton. We propose a novel Random Projection layer, which randomly projects the generated 3D skeleton and sends the resulting 2D pose to the discriminator. The discriminator improves by discriminating between the generated poses and pose samples from a real distribution of 2D poses. Training does not require correspondence between the 2D inputs to either the generator or the discriminator. We apply our approach to the task of 3D human pose estimation. Results on Human3.6M dataset demonstrates that our approach outperforms many previous supervised and weakly supervised approaches.},
author = {Drover, Dylan and {M. V}, Rohith and Chen, Ching-Hang and Agrawal, Amit and Tyagi, Ambrish and Huynh, Cong Phuoc},
booktitle = {{Computer Vision -- ECCV 2018 Workshops}},
pages = {78--94},
publisher = {Springer International Publishing},
title = {{Can 3D Pose Be Learned from 2D Projections Alone?}},
year = {2019}
}
@article{ionescu14,
author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian},
doi = {10.1109/tpami.2013.248},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
localfile = {bibliography_resources/ionescu - Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments.pdf},
month = jul,
number = {7},
pages = {1325--1339},
publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
title = {{Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}},
volume = {36},
x-fetchedfrom = {DOI},
year = {2014}
}
@article{sun17,
author = {Sun, Xiao and Shang, Jiaxiang and Liang, Shuang and Wei, Yichen},
journal = {Computer Vision and Image Understanding},
pages = {1--8},
title = {{Compositional Human Pose Regression}},
volume = {176-177},
year = {2018}
}
@article{martinez17,
abstract = { Following the success of deep convolutional networks, state-of-the-art
methods for 3d human pose estimation have focused on deep end-to-end systems
that predict 3d joint locations given raw image pixels. Despite their excellent
performance, it is often not easy to understand whether their remaining error
stems from a limited 2d pose (visual) understanding, or from a failure to map
2d poses into 3-dimensional positions. With the goal of understanding these
sources of error, we set out to build a system that given 2d joint locations
predicts 3d positions. Much to our surprise, we have found that, with current
technology, "lifting" ground truth 2d joint locations to 3d space is a task
that can be solved with a remarkably low error rate: a relatively simple deep
feed-forward network outperforms the best reported result by about 30\% on
Human3.6M, the largest publicly available 3d pose estimation benchmark.
Furthermore, training our system on the output of an off-the-shelf
state-of-the-art 2d detector (i.e., using images as input) yields state of the
art results -- this includes an array of systems that have been trained
end-to-end specifically for this task. Our results indicate that a large
portion of the error of modern deep 3d pose estimation systems stems from their
visual analysis, and suggests directions to further advance the state of the
art in 3d human pose estimation.
},
archiveprefix = {arXiv},
author = {Martinez, Julieta and Hossain, Rayat and Romero, Javier and Little, James J.},
comment = {published = 2017-05-08T21:48:37Z, updated = 2017-08-04T18:36:24Z, Accepted to ICCV 17},
eprint = {1705.03098v2},
journal = {2017 IEEE International Conference on Computer Vision (ICCV)},
localfile = {./bibliography_resources/martinez - a simple baseline for 3d hume pose estimation.pdf},
month = aug,
primaryclass = {cs.CV},
title = {{A simple yet effective baseline for 3d human pose estimation}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@article{zhou16,
author = {Zhou, Xiaowei and Zhu, Menglong and Leonardos, Spyridon and Derpanis, Konstantinos G. and Daniilidis, Kostas},
journal = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {4966--4975},
title = {{Sparseness Meets Deepness: 3D Human Pose Estimation from Monocular Video}},
year = {2016}
}
@article{tekin17,
author = {Tekin, Bugra and M{\'a}rquez-Neila, Pablo and Salzmann, Mathieu and Fua, Pascal},
journal = {2017 IEEE International Conference on Computer Vision (ICCV)},
pages = {3961--3970},
title = {{Learning to Fuse 2D and 3D Image Cues for Monocular Body Pose Estimation}},
year = {2017}
}
@inproceedings{ioffe15,
abstract = { Training Deep Neural Networks is complicated by the fact that the
distribution of each layer's inputs changes during training, as the parameters
of the previous layers change. This slows down the training by requiring lower
learning rates and careful parameter initialization, and makes it notoriously
hard to train models with saturating nonlinearities. We refer to this
phenomenon as internal covariate shift, and address the problem by normalizing
layer inputs. Our method draws its strength from making normalization a part of
the model architecture and performing the normalization for each training
mini-batch. Batch Normalization allows us to use much higher learning rates and
be less careful about initialization. It also acts as a regularizer, in some
cases eliminating the need for Dropout. Applied to a state-of-the-art image
classification model, Batch Normalization achieves the same accuracy with 14
times fewer training steps, and beats the original model by a significant
margin. Using an ensemble of batch-normalized networks, we improve upon the
best published result on ImageNet classification: reaching 4.9\% top-5
validation error (and 4.8\% test error), exceeding the accuracy of human raters.
},
archiveprefix = {arXiv},
author = {Ioffe, Sergey and Szegedy, Christian},
booktitle = {{Proceedings of the 32nd International Conference on Machine Learning - Volume 37}},
comment = {published = 2015-02-11T01:44:18Z, updated = 2015-03-02T20:44:12Z},
eprint = {1502.03167v3},
localfile = {bibliography_resources/ioffe - Batch Normalization.pdf},
month = mar,
primaryclass = {cs.LG},
series = {{ICML'15}},
title = {{Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}},
x-fetchedfrom = {arXiv.org},
year = {2015}
}
@article{yasin16,
author = {Yasin, Hashim and Iqbal, Umar and Kr{\"u}ger, Bj{\"o}rn and Weber, Alexander and Gall, Juergen},
journal = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {4948--4956},
title = {{A Dual-Source Approach for 3D Pose Estimation from a Single Image}},
year = {2016}
}
@article{wandt19,
author = {Wandt, Bastian and Rosenhahn, Bodo},
journal = {ArXiv},
title = {{RepNet: Weakly Supervised Training of an Adversarial Reprojection Network for 3D Human Pose Estimation}},
volume = {abs/1902.09868},
year = {2019}
}
@article{bogo16,
author = {Bogo, Federica and Kanazawa, Angjoo and Lassner, Christoph and Gehler, Peter and Romero, Javier and Black, Michael J.},
journal = {Computer Vision -- ECCV 2016},
localfile = {bibliography_resources/bogo - keep it smpl.pdf},
title = {{Keep it SMPL: Automatic Estimation of 3D Human Pose and Shape from a Single Image}},
year = {2016}
}
@article{moreno-noguer16,
abstract = { This paper addresses the problem of 3D human pose estimation from a single
image. We follow a standard two-step pipeline by first detecting the 2D
position of the $N$ body joints, and then using these observations to infer 3D
pose. For the first step, we use a recent CNN-based detector. For the second
step, most existing approaches perform 2$N$-to-3$N$ regression of the Cartesian
joint coordinates. We show that more precise pose estimates can be obtained by
representing both the 2D and 3D human poses using $N\times N$ distance
matrices, and formulating the problem as a 2D-to-3D distance matrix regression.
For learning such a regressor we leverage on simple Neural Network
architectures, which by construction, enforce positivity and symmetry of the
predicted matrices. The approach has also the advantage to naturally handle
missing observations and allowing to hypothesize the position of non-observed
joints. Quantitative results on Humaneva and Human3.6M datasets demonstrate
consistent performance gains over state-of-the-art. Qualitative evaluation on
the images in-the-wild of the LSP dataset, using the regressor learned on
Human3.6M, reveals very promising generalization results.
},
archiveprefix = {arXiv},
author = {Moreno-Noguer, Francesc},
comment = {published = 2016-11-28T07:36:31Z, updated = 2016-11-28T07:36:31Z},
eprint = {1611.09010v1},
journal = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
localfile = {bibliography_resources/moreno-noguer - 3d human pose estimation from a single image.pdf},
month = nov,
primaryclass = {cs.CV},
title = {{3D Human Pose Estimation from a Single Image via Distance Matrix Regression}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@article{kostrikov14,
author = {Kostrikov, Ilya and Gall, Juergen},
doi = {10.5244/c.28.80},
journal = {Proceedings of the British Machine Vision Conference 2014},
localfile = {bibliography_resources/kostrikov - depth sweep regression forests.pdf},
publisher = {British Machine Vision Association},
title = {{Depth Sweep Regression Forests for Estimating 3D Human Pose from Images}},
x-fetchedfrom = {DOI},
year = {2014}
}
@article{chen17,
abstract = { We explore 3D human pose estimation from a single RGB image. While many
approaches try to directly predict 3D pose from image measurements, we explore
a simple architecture that reasons through intermediate 2D pose predictions.
Our approach is based on two key observations (1) Deep neural nets have
revolutionized 2D pose estimation, producing accurate 2D predictions even for
poses with self occlusions. (2) Big-data sets of 3D mocap data are now readily
available, making it tempting to lift predicted 2D poses to 3D through simple
memorization (e.g., nearest neighbors). The resulting architecture is trivial
to implement with off-the-shelf 2D pose estimation systems and 3D mocap
libraries. Importantly, we demonstrate that such methods outperform almost all
state-of-the-art 3D pose estimation systems, most of which directly try to
regress 3D pose from 2D measurements.
},
archiveprefix = {arXiv},
author = {Chen, Ching-Hang and Ramanan, Deva},
comment = {published = 2016-12-20T06:45:49Z, updated = 2017-04-11T07:33:51Z, Demo code: https://github.com/flyawaychase/3DHumanPose},
eprint = {1612.06524v2},
journal = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
localfile = {bibliography_resources/chen - 3d = 2d + matching.pdf},
month = apr,
primaryclass = {cs.CV},
title = {{3D Human Pose Estimation = 2D Pose Estimation + Matching}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@article{gower75,
author = {Gower, J. C.},
doi = {10.1007/bf02291478},
issn = {1860-0980},
journal = {Psychometrika},
localfile = {bibliography_resources/gower - generalized procrustes analysis.pdf},
month = mar,
number = {1},
pages = {33--51},
publisher = {Springer Nature},
title = {{Generalized procrustes analysis}},
url = {http://dx.doi.org/10.1007/BF02291478},
volume = {40},
x-fetchedfrom = {DOI},
year = {1975}
}
@article{tome17,
author = {Tom{\`e}, Denis and Russell, Chris and Agapito, Lourdes},
journal = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {5689--5698},
title = {{Lifting from the Deep: Convolutional 3D Pose Estimation from a Single Image}},
year = {2017}
}
@misc{rogez16,
abstract = { This paper addresses the problem of 3D human pose estimation in the wild. A
significant challenge is the lack of training data, i.e., 2D images of humans
annotated with 3D poses. Such data is necessary to train state-of-the-art CNN
architectures. Here, we propose a solution to generate a large set of
photorealistic synthetic images of humans with 3D pose annotations. We
introduce an image-based synthesis engine that artificially augments a dataset
of real images with 2D human pose annotations using 3D Motion Capture (MoCap)
data. Given a candidate 3D pose our algorithm selects for each joint an image
whose 2D pose locally matches the projected 3D pose. The selected images are
then combined to generate a new synthetic image by stitching local image
patches in a kinematically constrained manner. The resulting images are used to
train an end-to-end CNN for full-body 3D pose estimation. We cluster the
training data into a large number of pose classes and tackle pose estimation as
a K-way classification problem. Such an approach is viable only with large
training sets such as ours. Our method outperforms the state of the art in
terms of 3D pose estimation in controlled environments (Human3.6M) and shows
promising results for in-the-wild images (LSP). This demonstrates that CNNs
trained on artificial images generalize well to real images.
},
archiveprefix = {arXiv},
author = {Rogez, Gr{\'e}gory and Schmid, Cordelia},
comment = {published = 2016-07-07T15:30:05Z, updated = 2016-10-28T12:43:51Z, 9 pages, accepted to appear in NIPS 2016},
eprint = {1607.02046v2},
localfile = {bibliography_resources/rogez - mocap-guided data augmentation.pdf},
month = oct,
primaryclass = {cs.CV},
title = {{MoCap-guided Data Augmentation for 3D Pose Estimation in the Wild}},
url = {http://arxiv.org/abs/1607.02046v2; http://arxiv.org/pdf/1607.02046v2},
x-fetchedfrom = {arXiv.org},
year = {2016}
}
@article{zhou18,
author = {Zhou, Xiaowei and Zhu, Menglong and Pavlakos, Georgios and Leonardos, Spyridon and Derpanis, Konstantinos G. and Daniilidis, Kostas},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
pages = {901--914},
title = {{MonoCap: Monocular Human Motion Capture using a CNN Coupled with a Geometric Prior}},
volume = {41},
year = {2019}
}
@inproceedings{zhou16_2,
author = {Zhou, Xingyi and Sun, Xiao and Zhang, Wei and Liang, Shuang and Wei, Yichen},
booktitle = {{ECCV Workshops}},
title = {{Deep Kinematic Pose Regression}},
year = {2016}
}
@article{tekin16,
author = {Tekin, Bugra and Rozantsev, Artem and Lepetit, Vincent and Fua, Pascal},
journal = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {991--1000},
title = {{Direct Prediction of 3D Body Poses from Motion Compensated Sequences}},
year = {2016}
}
@inproceedings{jahangiri17,
abstract = { We propose a method to generate multiple diverse and valid human pose
hypotheses in 3D all consistent with the 2D detection of joints in a monocular
RGB image. We use a novel generative model uniform (unbiased) in the space of
anatomically plausible 3D poses. Our model is compositional (produces a pose by
combining parts) and since it is restricted only by anatomical constraints it
can generalize to every plausible human 3D pose. Removing the model bias
intrinsically helps to generate more diverse 3D pose hypotheses. We argue that
generating multiple pose hypotheses is more reasonable than generating only a
single 3D pose based on the 2D joint detection given the depth ambiguity and
the uncertainty due to occlusion and imperfect 2D joint detection. We hope that
the idea of generating multiple consistent pose hypotheses can give rise to a
new line of future work that has not received much attention in the literature.
We used the Human3.6M dataset for empirical evaluation.
},
archiveprefix = {arXiv},
author = {Jahangiri, Ehsan and Yuille, Alan L.},
booktitle = {{2017 IEEE International Conference on Computer Vision Workshop}},
eprint = {1702.02258v2},
localfile = {bibliography_resources/jahangiri - generating multiple diverse hypotheses for hume 3d pose.pdf},
month = aug,
primaryclass = {cs.CV},
title = {{Generating Multiple Diverse Hypotheses for Human 3D Pose Consistent with 2D Joint Detections}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@article{mehta17,
abstract = { We propose a CNN-based approach for 3D human body pose estimation from single
RGB images that addresses the issue of limited generalizability of models
trained solely on the starkly limited publicly available 3D pose data. Using
only the existing 3D pose data and 2D pose data, we show state-of-the-art
performance on established benchmarks through transfer of learned features,
while also generalizing to in-the-wild scenes. We further introduce a new
training set for human body pose estimation from monocular images of real
humans that has the ground truth captured with a multi-camera marker-less
motion capture system. It complements existing corpora with greater diversity
in pose, human appearance, clothing, occlusion, and viewpoints, and enables an
increased scope of augmentation. We also contribute a new benchmark that covers
outdoor and indoor scenes, and demonstrate that our 3D pose dataset shows
better in-the-wild performance than existing annotated data, which is further
improved in conjunction with transfer learning from 2D pose data. All in all,
we argue that the use of transfer learning of representations in tandem with
algorithmic and data contributions is crucial for general 3D body pose
estimation.
},
archiveprefix = {arXiv},
author = {Mehta, Dushyant and Rhodin, Helge and Casas, Dan and Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and Theobalt, Christian},
comment = {published = 2016-11-29T20:03:19Z, updated = 2017-10-04T15:21:46Z, Accepted at the International Conference on 3D Vision (3DV) 2017},
eprint = {1611.09813v5},
journal = {2017 International Conference on 3D Vision (3DV)},
localfile = {bibliography_resources/mehta - 3d pose estimation using improved cnn supervision.pdf},
month = oct,
primaryclass = {cs.CV},
title = {{Monocular 3D Human Pose Estimation In The Wild Using Improved CNN Supervision}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@article{goodfellow17,
abstract = { This report summarizes the tutorial presented by the author at NIPS 2016 on
generative adversarial networks (GANs). The tutorial describes: (1) Why
generative modeling is a topic worth studying, (2) how generative models work,
and how GANs compare to other generative models, (3) the details of how GANs
work, (4) research frontiers in GANs, and (5) state-of-the-art image models
that combine GANs with other methods. Finally, the tutorial contains three
exercises for readers to complete, and the solutions to these exercises.
},
archiveprefix = {arXiv},
author = {Goodfellow, Ian},
comment = {published = 2016-12-31T19:17:17Z, updated = 2017-04-03T21:57:48Z, v2-v4 are all typo fixes. No substantive changes relative to v1},
eprint = {1701.00160v4},
localfile = {bibliography_resources/goodfellow - nips 2016 tutorial: GAN.pdf},
month = apr,
primaryclass = {cs.LG},
title = {{NIPS 2016 Tutorial: Generative Adversarial Networks}},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@misc{gidel19,
abstract = { Games generalize the single-objective optimization paradigm by introducing
different objective functions for different players. Differentiable games often
proceed by simultaneous or alternating gradient updates. In machine learning,
games are gaining new importance through formulations like generative
adversarial networks (GANs) and actor-critic systems. However, compared to
single-objective optimization, game dynamics are more complex and less
understood. In this paper, we analyze gradient-based methods with momentum on
simple games. We prove that alternating updates are more stable than
simultaneous updates. Next, we show both theoretically and empirically that
alternating gradient updates with a negative momentum term achieves convergence
in a difficult toy adversarial problem, but also on the notoriously difficult
to train saturating GANs.
},
archiveprefix = {arXiv},
author = {Gidel, Gauthier and Hemmat, Reyhane Askari and Pezeshki, Mohammad and Lepriol, Remi and Huang, Gabriel and Lacoste-Julien, Simon and Mitliagkas, Ioannis},
comment = {published = 2018-07-12T17:46:56Z, updated = 2019-03-27T16:32:10Z, Appears in: Proceedings of the 22nd International Conference on Artificial Intelligence and Statistics (AISTATS 2019). 25 pages},
eprint = {1807.04740v3},
localfile = {bibliography_resources/gidel - negative momentum.pdf},
month = mar,
primaryclass = {cs.LG},
title = {{Negative Momentum for Improved Game Dynamics}},
url = {http://arxiv.org/abs/1807.04740v3; http://arxiv.org/pdf/1807.04740v3},
x-fetchedfrom = {arXiv.org},
year = {2019}
}
@misc{pavllo19,
abstract = { In this work, we demonstrate that 3D poses in video can be effectively
estimated with a fully convolutional model based on dilated temporal
convolutions over 2D keypoints. We also introduce back-projection, a simple and
effective semi-supervised training method that leverages unlabeled video data.
We start with predicted 2D keypoints for unlabeled video, then estimate 3D
poses and finally back-project to the input 2D keypoints. In the supervised
setting, our fully-convolutional model outperforms the previous best result
from the literature by 6 mm mean per-joint position error on Human3.6M,
corresponding to an error reduction of 11\%, and the model also shows
significant improvements on HumanEva-I. Moreover, experiments with
back-projection show that it comfortably outperforms previous state-of-the-art
results in semi-supervised settings where labeled data is scarce. Code and
models are available at https://github.com/facebookresearch/VideoPose3D
},
archiveprefix = {arXiv},
author = {Pavllo, Dario and Feichtenhofer, Christoph and Grangier, David and Auli, Michael},
comment = {published = 2018-11-28T18:56:36Z, updated = 2019-03-29T13:36:46Z, CVPR 2019},
eprint = {1811.11742v2},
localfile = {bibliography_resources/pavllo - 3d human pose estimation in video with temporal convolutions},
month = mar,
primaryclass = {cs.CV},
title = {{3D human pose estimation in video with temporal convolutions and semi-supervised training}},
url = {http://arxiv.org/abs/1811.11742v2; http://arxiv.org/pdf/1811.11742v2},
x-fetchedfrom = {arXiv.org},
year = {2019}
}
@inproceedings{chorowski14,
abstract = { We replace the Hidden Markov Model (HMM) which is traditionally used in
continuous speech recognition with a bi-directional recurrent neural network
encoder coupled to a recurrent neural network decoder that directly emits a
stream of phonemes. The alignment between the input and output sequences is
established using an attention mechanism: the decoder emits each symbol based
on a context created with a subset of input symbols elected by the attention
mechanism. We report initial results demonstrating that this new approach
achieves phoneme error rates that are comparable to the state-of-the-art
HMM-based decoders, on the TIMIT dataset.
},
archiveprefix = {arXiv},
author = {Chorowski, Jan and Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
booktitle = {{NIPS 2014 Workshop on Deep Learning}},
comment = {published = 2014-12-04T10:00:19Z, updated = 2014-12-04T10:00:19Z, As accepted to: Deep Learning and Representation Learning Workshop, NIPS 2014},
eprint = {1412.1602v1},
localfile = {bibliography_resources/chorowski - e2e continuous speech recognition.pdf},
month = dec,
primaryclass = {cs.NE},
title = {{End-to-end Continuous Speech Recognition using Attention-based Recurrent NN: First Results}},
x-fetchedfrom = {arXiv.org},
year = {2014}
}
@article{tung17,
abstract = { Researchers have developed excellent feed-forward models that learn to map
images to desired outputs, such as to the images' latent factors, or to other
images, using supervised learning. Learning such mappings from unlabelled data,
or improving upon supervised models by exploiting unlabelled data, remains
elusive. We argue that there are two important parts to learning without
annotations: (i) matching the predictions to the input observations, and (ii)
matching the predictions to known priors. We propose Adversarial Inverse
Graphics networks (AIGNs): weakly supervised neural network models that combine
feedback from rendering their predictions, with distribution matching between
their predictions and a collection of ground-truth factors. We apply AIGNs to
3D human pose estimation and 3D structure and egomotion estimation, and
outperform models supervised by only paired annotations. We further apply AIGNs
to facial image transformation using super-resolution and inpainting renderers,
while deliberately adding biases in the ground-truth datasets. Our model
seamlessly incorporates such biases, rendering input faces towards young, old,
feminine, masculine or Tom Cruise-like equivalents (depending on the chosen
bias), or adding lip and nose augmentations while inpainting concealed lips and
noses.
},
archiveprefix = {arXiv},
author = {Tung, Hsiao-Yu Fish and Harley, Adam W. and Seto, William and Fragkiadaki, Katerina},
comment = {published = 2017-05-31T16:30:07Z, updated = 2017-09-02T01:10:17Z},
eprint = {1705.11166v3},
localfile = {bibliography_resources/tung - adversarial inverse graphics networks.pdf},
month = sep,
journal = {2017 IEEE International Conference on Computer Vision (ICCV)},
primaryclass = {cs.CV},
title = {{Adversarial Inverse Graphics Networks: Learning 2D-to-3D Lifting and Image-to-Image Translation from Unpaired Supervision}},
url = {http://arxiv.org/abs/1705.11166v3; http://arxiv.org/pdf/1705.11166v3},
x-fetchedfrom = {arXiv.org},
year = {2017}
}
@inproceedings{kingma17,
abstract = { We introduce Adam, an algorithm for first-order gradient-based optimization
of stochastic objective functions, based on adaptive estimates of lower-order
moments. The method is straightforward to implement, is computationally
efficient, has little memory requirements, is invariant to diagonal rescaling
of the gradients, and is well suited for problems that are large in terms of
data and/or parameters. The method is also appropriate for non-stationary
objectives and problems with very noisy and/or sparse gradients. The
hyper-parameters have intuitive interpretations and typically require little
tuning. Some connections to related algorithms, on which Adam was inspired, are
discussed. We also analyze the theoretical convergence properties of the
algorithm and provide a regret bound on the convergence rate that is comparable
to the best known results under the online convex optimization framework.
Empirical results demonstrate that Adam works well in practice and compares
favorably to other stochastic optimization methods. Finally, we discuss AdaMax,
a variant of Adam based on the infinity norm.
},
archiveprefix = {arXiv},
author = {Kingma, Diederik P. and Ba, Jimmy},
booktitle = {{ICLR 2015}},
comment = {published = 2014-12-22T13:54:29Z, updated = 2017-01-30T01:27:54Z, Published as a conference paper at the 3rd International Conference for Learning Representations, San Diego, 2015},
eprint = {1412.6980v9},
month = jan,
primaryclass = {cs.LG},
title = {{Adam: A Method for Stochastic Optimization}},
x-fetchedfrom = {arXiv.org},
year = {2015}
}
@inproceedings{trumble17,
author = {Trumble, Matt and Gilbert, Andrew and Malleson, Charles and Hilton, Adrian and Collomosse, John},
booktitle = {{2017 British Machine Vision Conference (BMVC)}},
title = {{Total Capture: 3D Human Pose Estimation Fusing Video and Inertial Sensors}},
year = {2017}
}
@inproceedings{grinciunaite16,
abstract = { This paper explores the capabilities of convolutional neural networks to deal
with a task that is easily manageable for humans: perceiving 3D pose of a human
body from varying angles. However, in our approach, we are restricted to using
a monocular vision system. For this purpose, we apply a convolutional neural
network approach on RGB videos and extend it to three dimensional convolutions.
This is done via encoding the time dimension in videos as the 3rd
dimension in convolutional space, and directly regressing to human body joint
positions in 3D coordinate space. This research shows the ability of such a
network to achieve state-of-the-art performance on the selected Human3.6M
dataset, thus demonstrating the possibility of successfully representing
temporal data with an additional dimension in the convolutional operation.
},
archiveprefix = {arXiv},
author = {Grinciunaite, Agne and Gudi, Amogh and Tasli, Emrah and den Uyl, Marten},
booktitle = {{Computer Vision -- ECCV 2016 Workshops}},
comment = {published = 2016-08-31T20:55:26Z, updated = 2016-10-19T12:44:15Z, Accepted at ECCV 2016 Workshop on: Brave new ideas for motion representations in videos},
doi = {10.1007/978-3-319-49409-8_5},
eprint = {1609.00036v3},
localfile = {bibliography_resources/grinciunaite16 - human pose estimation in space and time.pdf},
month = oct,
primaryclass = {cs.CV},
title = {{Human Pose Estimation in Space and Time using 3D CNN}},
x-fetchedfrom = {arXiv.org},
year = {2016}
}
@misc{andriluka18,
abstract = { Human poses and motions are important cues for analysis of videos with people
and there is strong evidence that representations based on body pose are highly
effective for a variety of tasks such as activity recognition, content
retrieval and social signal processing. In this work, we aim to further advance
the state of the art by establishing "PoseTrack", a new large-scale benchmark
for video-based human pose estimation and articulated tracking, and bringing
together the community of researchers working on visual human analysis. The
benchmark encompasses three competition tracks focusing on i) single-frame
multi-person pose estimation, ii) multi-person pose estimation in videos, and
iii) multi-person articulated tracking. To facilitate the benchmark and
challenge we collect, annotate and release a new large-scale benchmark dataset
that features videos with multiple people labeled with person tracks and
articulated pose. A centralized evaluation server is provided to allow
participants to evaluate on a held-out test set. We envision that the proposed
benchmark will stimulate productive research both by providing a large and
representative training dataset as well as providing a platform to objectively
evaluate and compare the proposed methods. The benchmark is freely accessible
at https://posetrack.net.
},
archiveprefix = {arXiv},
author = {Andriluka, Mykhaylo and Iqbal, Umar and Insafutdinov, Eldar and Pishchulin, Leonid and Milan, Anton and Gall, Juergen and Schiele, Bernt},
comment = {published = 2017-10-27T06:20:30Z, updated = 2018-04-10T18:20:56Z, www.posetrack.net},
eprint = {1710.10000v2},
month = apr,
primaryclass = {cs.CV},
title = {{PoseTrack: A Benchmark for Human Pose Estimation and Tracking}},
url = {http://arxiv.org/abs/1710.10000v2; http://arxiv.org/pdf/1710.10000v2},
x-fetchedfrom = {arXiv.org},
year = {2018}
}
@misc{guler18,
abstract = { In this work, we establish dense correspondences between RGB image and a
surface-based representation of the human body, a task we refer to as dense
human pose estimation. We first gather dense correspondences for 50K persons
appearing in the COCO dataset by introducing an efficient annotation pipeline.
We then use our dataset to train CNN-based systems that deliver dense
correspondence 'in the wild', namely in the presence of background, occlusions
and scale variations. We improve our training set's effectiveness by training
an 'inpainting' network that can fill in missing groundtruth values and report
clear improvements with respect to the best results that would be achievable in
the past. We experiment with fully-convolutional networks and region-based
models and observe a superiority of the latter; we further improve accuracy
through cascading, obtaining a system that delivers highly-accurate results in
real time. Supplementary materials and videos are provided on the project page
http://densepose.org
},
archiveprefix = {arXiv},
author = {G{\"u}ler, R{\i}za Alp and Neverova, Natalia and Kokkinos, Iasonas},
comment = {published = 2018-02-01T18:53:26Z, updated = 2018-02-01T18:53:26Z},
eprint = {1802.00434v1},
localfile = {guler - dense pose.pdf},
month = feb,
primaryclass = {cs.CV},
title = {{DensePose: Dense Human Pose Estimation In The Wild}},
url = {http://arxiv.org/abs/1802.00434v1; http://arxiv.org/pdf/1802.00434v1},
x-fetchedfrom = {arXiv.org},
year = {2018}
}
@article{pavlakos17,
author = {Pavlakos, Georgios and Zhou, Xiaowei and Derpanis, Konstantinos G. and Daniilidis, Kostas},
journal = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {1263--1272},
title = {{Coarse-to-Fine Volumetric Prediction for Single-Image 3D Human Pose}},
year = {2017}
}
@article{aroeira16,
abstract = {Purpose: To review techniques for non-invasive postural evaluation of adolescent idiopathic scoliosis (AIS) based on information extraction from images using computer methods. Methods: The Scopus, Web of Science, MEDLINE, SciELO and PubMed databases were searched for the period 2011--2015. Results: 131 articles were found based on the keywords, of which 15 met the established eligibility criteria. Of these, 4 were based on photogrammetry, and 11 on laser, structured light, ultrasound, or Moir{\'e} projection. In these studies, the methodological quality varied from low to high. Conclusions: The findings indicated diversity in methodologies; 14 of the 15 articles reviewed were limited to the evaluation of the topography of the posterior back. One study, using two-dimensional photogrammetry, presented a whole-body postural evaluation. As the asymmetry in AIS can extend to the whole body, more attention should be given to developing full-body assessment techniques to provide important additional data to aid in treatment decisions.},
author = {Aroeira, Rozilene Maria C. and de Casas, Estevam B. Las and Pertence, Ant{\^o}nio Eust{\'a}quio M. and Greco, Marcelo and Tavares, Jo{\~a}o Manuel R.S.},
doi = {10.1016/j.jbmt.2016.02.004},
journal = {Journal of Bodywork and Movement Therapies},
keywords = {Body posture; Evaluation; Adolescent idiopathic scoliosis; Computational methods; Imaging},
number = {4},
pages = {832--843},
title = {{Non-invasive methods of computer vision in the posture evaluation of adolescent idiopathic scoliosis}},
volume = {20},
year = {2016}
}
@article{khan18,
abstract = {A neurological illness is a disorder of the human nervous system that can result in various diseases, including motor disabilities. Neurological disorders may affect the motor neurons, which are associated with skeletal muscles and control the body movement. Consequently, they introduce some diseases in humans, e.g. cerebral palsy, spinal scoliosis, peripheral paralysis of arms/legs, hip joint dysplasia and various myopathies. Vojta therapy is considered a useful technique to treat motor disabilities. In Vojta therapy, a specific stimulation is given to the patient's body to perform certain reflexive pattern movements which the patient is unable to perform in a normal manner. The repetition of stimulation ultimately brings forth the previously blocked connections between the spinal cord and the brain. After a few therapy sessions, the patient can perform these movements without external stimulation. In this paper, we propose a computer vision-based system to monitor the correct movements of the patient during the therapy treatment using RGBD data. The proposed framework works in three steps. In the first step, the patient's body is automatically detected and segmented, and two novel techniques are proposed for this purpose. In the second step, a multi-dimensional feature vector is computed to define various movements of the patient's body during the therapy. In the final step, a multi-class support vector machine is used to classify these movements. The experimental evaluation carried out on a large captured dataset shows that the proposed system is highly useful in monitoring the patient's body movements during Vojta therapy.},
author = {Khan, Muhammad Hassan and Helsper, Julien and Farid, Muhammad Shahid and Grzegorzek, Marcin},
doi = {10.1016/j.ijmedinf.2018.02.010},
journal = {International Journal of Medical Informatics},
keywords = {Vojta therapy; Cerebral palsy; Spinal scoliosis; Musculoskeletal system; Computer vision; Microsoft Kinect},
localfile = {bibliography_resources/khan - a computer vision based system for monitoring vojta therapy.pdf},
pages = {85--95},
title = {{A computer vision-based system for monitoring Vojta therapy}},
volume = {113},
year = {2018}
}
@inproceedings{richter15,
abstract = {In European countries, the increasing number of elderly with dementia causes serious problems for the society, especially with regard to the caring sector. As technical support systems can be of assistance to caregivers and patients, a mobility assessment system for demented people is presented. The grade of mobility is measured by means of the person's pose and movements in a monitored area. For this purpose, pose estimation and movement detection algorithms have been developed. These algorithms process 3-D data, which are provided by an optical stereo sensor installed in a living environment. The experiments demonstrated that the algorithms work robustly. In connection with a human machine interface, the system facilitates a mobilisation as well as a more valid assessment of the patient's medical condition than it is presently the case. Moreover, recent advances with regard to action recognition as well as an outlook about necessary developments are presented.},
address = {Cham},
author = {Richter, Julia and Wiede, Christian and Hirtz, Gangolf},
booktitle = {{Pattern Recognition: Applications and Methods}},
editor = {Fred, Ana and {De Marsico}, Maria and Figueiredo, M{\'a}rio},
isbn = {978-3-319-27677-9},
pages = {172--184},
publisher = {Springer International Publishing},
title = {{Pose Estimation and Movement Detection for Mobility Assessment of Elderly People in an Ambient Assisted Living Application}},
year = {2015}
}
@article{pishchulin16,
author = {Pishchulin, Leonid and Insafutdinov, Eldar and Tang, Siyu and Andres, Bjoern and Andriluka, Mykhaylo and Gehler, Peter V. and Schiele, Bernt},
journal = {2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {4929--4937},
title = {{DeepCut: Joint Subset Partition and Labeling for Multi Person Pose Estimation}},
year = {2016}
}
@inproceedings{newell16,
abstract = { This work introduces a novel convolutional network architecture for the task
of human pose estimation. Features are processed across all scales and
consolidated to best capture the various spatial relationships associated with
the body. We show how repeated bottom-up, top-down processing used in
conjunction with intermediate supervision is critical to improving the
performance of the network. We refer to the architecture as a "stacked
hourglass" network based on the successive steps of pooling and upsampling that
are done to produce a final set of predictions. State-of-the-art results are
achieved on the FLIC and MPII benchmarks outcompeting all recent methods.
},
archiveprefix = {arXiv},
author = {Newell, Alejandro and Yang, Kaiyu and Deng, Jia},
booktitle = {{Computer Vision -- ECCV 2016}},
comment = {published = 2016-03-22T19:56:42Z, updated = 2016-07-26T19:19:37Z},
eprint = {1603.06937v2},
month = jul,
primaryclass = {cs.CV},
title = {{Stacked Hourglass Networks for Human Pose Estimation}},
x-fetchedfrom = {arXiv.org},
year = {2016}
}
@misc{cao18,
abstract = { Realtime multi-person 2D pose estimation is a key component in enabling
machines to have an understanding of people in images and videos. In this work,
we present a realtime approach to detect the 2D pose of multiple people in an
image. The proposed method uses a nonparametric representation, which we refer
to as Part Affinity Fields (PAFs), to learn to associate body parts with
individuals in the image. This bottom-up system achieves high accuracy and
realtime performance, regardless of the number of people in the image. In
previous work, PAFs and body part location estimation were refined
simultaneously across training stages. We demonstrate that a PAF-only
refinement rather than both PAF and body part location refinement results in a
substantial increase in both runtime performance and accuracy. We also present
the first combined body and foot keypoint detector, based on an internal
annotated foot dataset that we have publicly released. We show that the
combined detector not only reduces the inference time compared to running them
sequentially, but also maintains the accuracy of each component individually.
This work has culminated in the release of OpenPose, the first open-source
realtime system for multi-person 2D pose detection, including body, foot, hand,
and facial keypoints.
},
archiveprefix = {arXiv},
author = {Cao, Zhe and Hidalgo, Gines and Simon, Tomas and Wei, Shih-En and Sheikh, Yaser},
comment = {published = 2018-12-18T18:50:33Z, updated = 2019-05-30T23:46:18Z, Journal version of arXiv:1611.08050, with better accuracy and faster speed, release a new foot keypoint dataset: https://cmu-perceptual-computing-lab.github.io/foot\_keypoint\_dataset/},
eprint = {1812.08008v2},
month = may,
primaryclass = {cs.CV},
title = {{OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields}},
x-fetchedfrom = {arXiv.org},
year = {2018}
}
@inproceedings{einfalt18,
abstract = {In this paper we consider the problem of human pose estimation in real-world videos of swimmers. Swimming channels allow filming swimmers simultaneously above and below the water surface with a single stationary camera. These recordings can be used to quantitatively assess the athletes' performance. The quantitative evaluation, so far, requires manual annotations of body parts in each video frame. We therefore apply the concept of CNNs in order to automatically infer the required pose information. Starting with an off-the-shelf architecture, we develop extensions to leverage activity information - in our case the swimming style of an athlete - and the continuous nature of the video recordings. Our main contributions are threefold: (a) We apply and evaluate a fine-tuned Convolutional Pose Machine architecture as a baseline in our very challenging aquatic environment and discuss its error modes, (b) we propose an extension to input swimming style information into the fully convolutional architecture and (c) modify the architecture for continuous pose estimation in videos. With these additions we achieve reliable pose estimates with up to +16\% more correct body joint detections compared to the baseline architecture.},
archiveprefix = {arXiv},
author = {Einfalt, Moritz and Zecha, Dan and Lienhart, Rainer},
booktitle = {{2018 IEEE Winter Conference on Applications of Computer Vision}},
comment = {published = 2018-02-02T10:56:41Z, updated = 2018-02-02T10:56:41Z, 10 pages, 9 figures, accepted at WACV 2018},
eprint = {1802.00634v1},
month = mar,
primaryclass = {cs.CV},
title = {{Activity-conditioned continuous human pose estimation for performance analysis of athletes using the example of swimming}},
x-fetchedfrom = {arXiv.org},
year = {2018}
}
@inproceedings{zecha19,
author = {Zecha, Dan and Einfalt, Moritz and Lienhart, Rainer},
booktitle = {{CVPR 2019}},
title = {{Refining Joint Locations for Human Pose Tracking in Sports Videos}},
year = {2019}
}
@inproceedings{park16,
abstract = {While there has been a success in 2D human pose estimation with convolutional neural networks (CNNs), 3D human pose estimation has not been thoroughly studied. In this paper, we tackle the 3D human pose estimation task with end-to-end learning using CNNs. Relative 3D positions between one joint and the other joints are learned via CNNs. The proposed method improves the performance of CNN with two novel ideas. First, we added 2D pose information to estimate a 3D pose from an image by concatenating 2D pose estimation result with the features from an image. Second, we have found that more accurate 3D poses are obtained by combining information on relative positions with respect to multiple joints, instead of just one root joint. Experimental results show that the proposed method achieves comparable performance to the state-of-the-art methods on Human 3.6m dataset.},
author = {Park, Sungheon and Hwang, Jihye and Kwak, Nojun},
booktitle = {{Computer Vision -- ECCV 2016 Workshops}},
doi = {10.1007/978-3-319-49409-8_15},
publisher = {Springer},
title = {{3D Human Pose Estimation Using Convolutional Neural Networks with 2D Pose Information}},
x-fetchedfrom = {SpringerLink},
year = {2016}
}
@article{mehta17_2,
author = {Mehta, Dushyant and Sridhar, Srinath and Sotnychenko, Oleksandr and Rhodin, Helge and Shafiei, Mohammad and Seidel, Hans-Peter and Xu, Weipeng and Casas, Dan and Theobalt, Christian},
journal = {ACM Transactions on Graphics},
month = jul,
number = {4},
numpages = {14},
title = {{VNect: Real-time 3D Human Pose Estimation with a Single RGB Camera}},
volume = {36},
year = {2017}
}
@misc{wei16,
abstract = { Pose Machines provide a sequential prediction framework for learning rich
implicit spatial models. In this work we show a systematic design for how
convolutional networks can be incorporated into the pose machine framework for
learning image features and image-dependent spatial models for the task of pose
estimation. The contribution of this paper is to implicitly model long-range
dependencies between variables in structured prediction tasks such as
articulated pose estimation. We achieve this by designing a sequential
architecture composed of convolutional networks that directly operate on belief
maps from previous stages, producing increasingly refined estimates for part
locations, without the need for explicit graphical model-style inference. Our
approach addresses the characteristic difficulty of vanishing gradients during
training by providing a natural learning objective function that enforces
intermediate supervision, thereby replenishing back-propagated gradients and
conditioning the learning procedure. We demonstrate state-of-the-art
performance and outperform competing methods on standard benchmarks including
the MPII, LSP, and FLIC datasets.
},
archiveprefix = {arXiv},
author = {Wei, Shih-En and Ramakrishna, Varun and Kanade, Takeo and Sheikh, Yaser},
comment = {published = 2016-01-30T16:15:28Z, updated = 2016-04-12T03:31:53Z, camera ready},
eprint = {1602.00134v4},
month = apr,
primaryclass = {cs.CV},
title = {{Convolutional Pose Machines}},
url = {http://arxiv.org/abs/1602.00134v4; http://arxiv.org/pdf/1602.00134v4},
x-fetchedfrom = {arXiv.org},
year = {2016}
}
@inproceedings{wandt18,
abstract = {This paper deals with motion capture of kinematic chains (e.g. human skeletons) from monocular image sequences taken by uncalibrated cameras. We present a method based on projecting an observation onto a kinematic chain space (KCS). An optimization of the nuclear norm is proposed that implicitly enforces structural properties of the kinematic chain. Unlike other approaches our method is not relying on training data or previously determined constraints such as particular body lengths. The proposed algorithm is able to reconstruct scenes with little or no camera motion and previously unseen motions. It is not only applicable to human skeletons but also to other kinematic chains for instance animals or industrial robots. We achieve state-of-the-art results on different benchmark databases and real world scenes.},
author = {Wandt, Bastian and Ackermann, Hanno and Rosenhahn, Bodo},
booktitle = {{Computer Vision -- ECCV 2018 Workshops}},
doi = {10.1007/978-3-030-11018-5_4},
isbn = {978-3-030-11018-5},
localfile = {bibliography_resources/wandt - a kinematic chain space for monocular motion capture.pdf},
publisher = {Springer},
title = {{A Kinematic Chain Space for Monocular Motion Capture}},
url = {http://dx.doi.org/10.1007/978-3-030-11018-5_4},
x-fetchedfrom = {SpringerLink},
year = {2018}
}