hadoop version                        # print the installed Hadoop version
hadoop fs -mkdir path                 # create a directory in HDFS
hadoop fs -ls path                    # list the contents of an HDFS directory
hadoop fs -put source path            # copy a local file into HDFS
hadoop fs -cat file.txt               # print the contents of an HDFS file
hadoop fs -touchz file.txt            # create an empty file in HDFS
hdfs fsck path                        # check the health of HDFS files and blocks
hadoop fs -df path                    # show free space on the file system
hadoop fs                             # list all supported file system commands
echo "text" > file.txt                # create a local file to upload
hadoop fs -moveFromLocal src dest     # move a local file into HDFS (deletes the local copy)
hadoop fs -rm path                    # remove a file (goes to trash if trash is enabled)
hadoop fs -cp src dest                # copy within HDFS
hadoop fs -mv src dest                # move/rename within HDFS
hadoop fs -setrep -w 2 file.txt       # set replication factor to 2 and wait until done
hadoop fs -expunge                    # empty the trash
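A minimal end-to-end session tying these commands together (paths and file names are illustrative):
echo "hello hdfs" > sample.txt
hadoop fs -mkdir /demo
hadoop fs -put sample.txt /demo
hadoop fs -ls /demo
hadoop fs -cat /demo/sample.txt
hadoop fs -setrep -w 2 /demo/sample.txt
hadoop fs -rm /demo/sample.txt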
docker run -d --name namenode hadoop-namenode
docker run -d --name datanode hadoop-datanode
docker cp input.txt namenode:/root        # on the host
docker exec -it namenode bash
cd /root                                  # inside the container
hadoop fs -put input.txt /input
hadoop jar wordcounter.jar /input /output
hadoop fs -ls /output
hadoop fs -cat /output/part-00000
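To copy the word-count results back to the host (using the names from above):
hadoop fs -get /output/part-00000 /root/result.txt   # inside the namenode container
exit
docker cp namenode:/root/result.txt .                # back on the host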
FROM bitnami/spark:latest
WORKDIR /app
COPY max_temperature.py .
COPY temperature_data.csv .
CMD ["spark-submit", "max_temperature.py"]
Build and run the image:
docker build -t spark-max-temp .
docker run -it spark-max-temp
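If the CSV changes often, it can be mounted at run time instead of rebuilt into the image (a sketch; the host path is illustrative):
docker run -it -v "$(pwd)/temperature_data.csv:/app/temperature_data.csv" spark-max-temp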
max_temperature.py
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder \
    .appName("Max Temperature with Year") \
    .getOrCreate()

# Read the CSV with a header row and inferred column types
data = spark.read.csv("temperature_data.csv", header=True, inferSchema=True)

# Sort by temperature descending; the first row holds the maximum
max_temp_row = data.orderBy(data["temperature"].desc()).first()

if max_temp_row:
    max_temp_df = spark.createDataFrame([
        Row(year=max_temp_row["year"], max_temperature=max_temp_row["temperature"])
    ])
    max_temp_df.show()
else:
    print("No temperature data available")

spark.stop()
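The script assumes temperature_data.csv contains at least a year and a temperature column, for example (values are illustrative):
year,temperature
1990,31.2
1991,35.7
1992,33.4
Note that orderBy sorts the whole dataset just to pick one row; on large inputs an aggregate such as data.agg({"temperature": "max"}) is cheaper, though recovering the matching year then needs an extra filter or join.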
Once the Hive stack defined below is running, open the Hive CLI (the container is named hive-server in the compose file):
docker exec -it hive-server hive
version: "3"
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    environment:
      - CLUSTER_NAME=hadoop-cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HDFS_CONF_dfs_replication=2
    ports:
      - "9870:9870"
      - "8020:8020"
    volumes:
      - namenode_data:/hadoop/dfs/name
  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    environment:
      - CLUSTER_NAME=hadoop-cluster
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HDFS_CONF_dfs_replication=2
    ports:
      - "9864:9864"
    depends_on:
      - namenode
    volumes:
      - datanode_data:/hadoop/dfs/data
  hive-metastore-postgresql:
    # Use official Postgres to avoid the broken init scripts in the bde image
    image: postgres:12
    container_name: hive-metastore-postgresql
    environment:
      POSTGRES_DB: metastore
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive
    ports:
      - "5432:5432"
    volumes:
      - hive_postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U hive -d metastore"]
      interval: 5s
      timeout: 5s
      retries: 12
  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-metastore
    environment:
      - SERVICE_PRECONDITION=namenode:8020 datanode:9864 hive-metastore-postgresql:5432
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql:5432/metastore
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionUserName=hive
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionPassword=hive
    command: /opt/hive/bin/hive --service metastore
    ports:
      - "9083:9083"
    depends_on:
      - hive-metastore-postgresql
      - namenode
  hive-server:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-server
    environment:
      - SERVICE_PRECONDITION=hive-metastore:9083
      - CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_fs_defaultFS=hdfs://namenode:8020
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql:5432/metastore
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionUserName=hive
      - HIVE_CORE_CONF_javax_jdo_option_ConnectionPassword=hive
    ports:
      - "10000:10000"
    depends_on:
      - hive-metastore
      - namenode
volumes:
  namenode_data:
  datanode_data:
  hive_postgres_data:
docker-compose up -d
docker exec -it namenode bash
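A quick smoke test from the Hive CLI opened earlier (table name and values are illustrative):
CREATE TABLE demo (id INT, name STRING);
INSERT INTO demo VALUES (1, 'alpha');
SELECT * FROM demo;
HiveServer2 is also reachable over JDBC on the exposed port 10000, e.g. via beeline:
docker exec -it hive-server beeline -u jdbc:hive2://localhost:10000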