-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathDockerfile-hadoop
127 lines (108 loc) · 3.42 KB
/
Dockerfile-hadoop
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
FROM openjdk:11-jdk AS jdk
FROM python:3.11

USER root

# --------------------------------------------------------
# JAVA
# --------------------------------------------------------
# Combine update + install in one layer (a standalone `apt-get update`
# layer goes stale and causes 404s on rebuild), use apt-get rather than
# the interactive-oriented `apt`, and clean the lists in the same layer
# so they never persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3-launchpadlib \
        software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Reuse the JDK from the openjdk stage instead of installing it via apt.
# TARGETARCH is auto-populated by BuildKit (amd64/arm64), so the same
# Dockerfile builds on both architectures; the default preserves the
# original arm64 behavior under the legacy builder.
ARG TARGETARCH=arm64
COPY --from=jdk /usr/local/openjdk-11 /usr/lib/jvm/java-11-openjdk-${TARGETARCH}/
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${TARGETARCH}/
# --------------------------------------------------------
# HADOOP
# --------------------------------------------------------
ENV HADOOP_VERSION=3.3.6
# archive.apache.org hosts every release permanently; downloads.apache.org
# removes superseded versions, which silently breaks pinned builds.
ENV HADOOP_URL=https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
# HADOOP_PREFIX and HADOOP_HOME intentionally point at the same tree:
# older Hadoop tooling reads HADOOP_PREFIX, newer tooling reads HADOOP_HOME.
ENV HADOOP_PREFIX=/opt/hadoop-$HADOOP_VERSION
ENV HADOOP_HOME=/opt/hadoop-$HADOOP_VERSION
ENV HADOOP_CONF_DIR=/etc/hadoop
ENV MULTIHOMED_NETWORK=1
ENV USER=root
# key=value ENV form (space form is deprecated); one PATH entry suffices
# since HADOOP_PREFIX and HADOOP_HOME are identical.
ENV PATH=$HADOOP_HOME/bin/:$PATH

# Download, unpack, link, and clean up in a single layer so the tarball
# never persists in an intermediate image layer.
RUN set -x \
    && curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \
    && tar -xf /tmp/hadoop.tar.gz -C /opt/ \
    && rm /tmp/hadoop.tar.gz* \
    && ln -s /opt/hadoop-$HADOOP_VERSION/etc/hadoop /etc/hadoop \
    && mkdir /opt/hadoop-$HADOOP_VERSION/logs \
    && mkdir /hadoop-data

# COPY, not ADD, for plain local files (hadolint DL3020).
COPY entrypoint.sh /entrypoint.sh
RUN chmod a+x /entrypoint.sh
COPY conf/core-site.xml $HADOOP_CONF_DIR/core-site.xml
COPY conf/hdfs-site.xml $HADOOP_CONF_DIR/hdfs-site.xml
COPY conf/mapred-site.xml $HADOOP_CONF_DIR/mapred-site.xml
COPY conf/yarn-site.xml $HADOOP_CONF_DIR/yarn-site.xml
# --------------------------------------------------------
# SPARK
# --------------------------------------------------------
# key=value ENV form throughout (space-separated form is deprecated).
ENV SPARK_VERSION=spark-3.5.0
# archive.apache.org keeps all releases; downloads.apache.org drops
# superseded ones (3.5.0 is no longer the current 3.5.x), so the pinned
# URL must use the archive to stay reproducible.
ENV SPARK_URL=https://archive.apache.org/dist/spark/${SPARK_VERSION}/${SPARK_VERSION}-bin-hadoop3.tgz
ENV SPARK_HOME=/opt/$SPARK_VERSION
ENV PATH=$SPARK_HOME/bin:$PATH
ENV PYSPARK_PYTHON=python3
# Fixed hash seed so string hashing is consistent across PySpark workers.
ENV PYTHONHASHSEED=1

# Fetch, extract, rename, and remove the tarball in one layer.
RUN set -x \
    && curl -fSL "${SPARK_URL}" -o /tmp/spark.tar.gz \
    && tar -xzf /tmp/spark.tar.gz -C /opt/ \
    && rm /tmp/spark.tar.gz* \
    && mv /opt/${SPARK_VERSION}-bin-hadoop3 /opt/${SPARK_VERSION}

# COPY, not ADD, for plain local files (hadolint DL3020).
COPY conf/core-site.xml $SPARK_HOME/conf/
COPY conf/yarn-site.xml $SPARK_HOME/conf/
#=========
# INSTALL PYTHON DEPS
#=========
# NOTE: the original ran `add-apt-repository ppa:deadsnakes/ppa`, but PPAs
# are Ubuntu-only and python:3.11 is Debian-based — the command is at best
# a no-op and at worst a build failure, and it is redundant anyway since
# the base image already ships Python 3.11. It has been removed.
# Packages are sorted alphabetically for diffability; the duplicate
# python3-dev entry was dropped; lists are cleaned in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        fonts-liberation \
        g++ \
        gcc \
        gconf-service \
        gfortran \
        libappindicator1 \
        libasound2 \
        libatk1.0-0 \
        libcairo2 \
        libcups2 \
        libfontconfig1 \
        libgdk-pixbuf2.0-0 \
        libgtk-3-0 \
        liblapack-dev \
        libnspr4 \
        libnss3 \
        libopenblas-dev \
        libpango-1.0-0 \
        libpython3-dev \
        libqpdf-dev \
        libxss1 \
        libzbar-dev \
        lsb-release \
        pkg-config \
        python3-dev \
        qpdf \
        subversion \
        wget \
        xdg-utils \
        xvfb \
    && apt-get autoremove -yqq --purge \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layers (DL3042).
RUN pip install --no-cache-dir --default-timeout=100 --upgrade pip
RUN pip install --no-cache-dir --force-reinstall pikepdf Cython numpy wheel setuptools
# COPY, not ADD, for a plain local file (hadolint DL3020).
COPY requirements.txt /requirements.txt
# run install
RUN pip install --no-cache-dir -r /requirements.txt