provectus · adelhighrullin · Oct 10, 2021
diff --git a/dataeng/processed_data/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/dataeng/processed_data/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,216 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "path_to_folder = \"../02-src-data/\"\n",
+    "path_to_img = \"https://raw.githubusercontent.com/provectus/internship/main/dataeng/02-src-data/\"\n",
+    "src_data = os.listdir(path_to_folder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_csv, data_img, user_ids = set(), set(), set()\n",
+    "for file in src_data:\n",
+    "    # Every entry that is not a CSV file is collected as an image.\n",
+    "    if not file.endswith(\".csv\"):\n",
+    "        data_img.add(file)\n",
+    "        continue\n",
+    "    # The user id is the CSV file name without its '.csv' suffix.\n",
+    "    user_ids.add(file[:-4])\n",
+    "    data_csv.add(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "# Read every per-user CSV and tag its rows with the user id, i.e. the file\n",
+    "# name without the '.csv' suffix. The frames are collected in a list and\n",
+    "# concatenated once; pd.concat replaces DataFrame.append, which returned a\n",
+    "# new frame instead of modifying in place and was removed in pandas 2.0.\n",
+    "frames = []\n",
+    "for csv in data_csv:\n",
+    "    csv_file = pd.read_csv(path_to_folder + csv)\n",
+    "    csv_file['id'] = csv[:-4]\n",
+    "    frames.append(csv_file)\n",
+    "# Keep the empty-input case well defined: an empty frame with just the id column.\n",
+    "output_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=['id'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>birthts</th>\n",
+       "      <th>last_name</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>5.065524e+11</td>\n",
+       "      <td>Kirk</td>\n",
+       "      <td>Robert</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>7.857108e+11</td>\n",
+       "      <td>Smith</td>\n",
+       "      <td>Amy</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3.400596e+11</td>\n",
+       "      <td>Quick</td>\n",
+       "      <td>William</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>9.712080e+11</td>\n",
+       "      <td>Miller</td>\n",
+       "      <td>Chad</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>7.858836e+11</td>\n",
+       "      <td>Hardin</td>\n",
+       "      <td>Christopher</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>6.706260e+11</td>\n",
+       "      <td>Garcia</td>\n",
+       "      <td>Rosa</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>9.559152e+11</td>\n",
+       "      <td>Raymond</td>\n",
+       "      <td>Adolph</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>1.018469e+12</td>\n",
+       "      <td>Shoup</td>\n",
+       "      <td>Jeanne</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>1.074805e+12</td>\n",
+       "      <td>Miller</td>\n",
+       "      <td>Dino</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>8.213940e+11</td>\n",
+       "      <td>Folkers</td>\n",
+       "      <td>Ellen</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         birthts  last_name   first_name   id\n",
+       "0   5.065524e+11       Kirk       Robert  NaN\n",
+       "1   7.857108e+11      Smith          Amy  NaN\n",
+       "2   3.400596e+11      Quick      William  NaN\n",
+       "3   9.712080e+11     Miller         Chad  NaN\n",
+       "4   7.858836e+11     Hardin  Christopher  NaN\n",
+       "..           ...        ...          ...  ...\n",
+       "95  6.706260e+11     Garcia         Rosa  NaN\n",
+       "96  9.559152e+11    Raymond       Adolph  NaN\n",
+       "97  1.018469e+12      Shoup       Jeanne  NaN\n",
+       "98  1.074805e+12     Miller         Dino  NaN\n",
+       "99  8.213940e+11    Folkers        Ellen  NaN\n",
+       "\n",
+       "[100 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 114,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output_data"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/dataeng/processed_data/answers.txt b/dataeng/processed_data/answers.txt
@@ -0,0 +1,108 @@
+===================== PYTHON ======================
+
+1. Optimise execution time of this Python code snippet:
+
+def count_connections(list1: list, list2: list) -> int:
+    """Return how many distinct values occur in both ``list1`` and ``list2``.
+
+    Only ``list1`` is converted to a set; ``set.intersection`` accepts any
+    iterable, so ``list2`` is scanned once without building a second set or
+    an intermediate list. Elements must be hashable.
+    """
+    return len(set(list1).intersection(list2))
+
+2. Given a string S, find the length of the longest substring without repeating characters. Analyze your solution and please provide Space and Time complexities.
+
+def longestSubstring(s: str) -> int:
+    """Return the length of the longest substring of ``s`` without repeated characters.
+
+    Sliding window over ``s``: ``last_seen`` remembers the index of the most
+    recent occurrence of every character and ``window_start`` is the left
+    edge of the current duplicate-free window.
+
+    Time complexity: O(n) -- each character is visited once and every
+    dictionary operation is O(1) on average.
+    Space complexity: O(min(n, k)), where k is the alphabet size -- at most
+    one dictionary entry per distinct character.
+    """
+    last_seen = {}
+    window_start = 0
+    best = 0
+    for index, letter in enumerate(s):
+        previous = last_seen.get(letter, -1)
+        # A repeat only matters when it falls inside the current window.
+        if previous >= window_start:
+            window_start = previous + 1
+        last_seen[letter] = index
+        best = max(best, index - window_start + 1)
+    return best
+
+3. Given a sorted array of distinct integers and a target value, return the index if the target is found. If not, return the index where it would be if it were inserted in order.
+
+def foo(t, arr):
+    """Return the index of ``t`` in ``arr``, or the index where it would be inserted.
+
+    ``arr`` must be sorted in ascending order and contain distinct values.
+    For such input the leftmost insertion point found by binary search is
+    the index of ``t`` when it is present and its insertion index otherwise.
+    An empty ``arr`` yields 0.
+
+    Time complexity: O(log n). Space complexity: O(1).
+    """
+    from bisect import bisect_left
+
+    return bisect_left(arr, t)
+
+===================== SQL =======================
+
+1. Rewrite this SQL without subquery:
+
+SELECT id
+FROM users
+WHERE id NOT IN (
+	SELECT user_id
+	FROM departments
+	WHERE department_id = 1
+);
+
+-- Anti-join: keep the users that have no matching row in department 1.
+-- Users without any department row are kept, exactly as with NOT IN.
+SELECT u.id
+FROM users AS u
+LEFT JOIN departments AS d
+	ON d.user_id = u.id AND d.department_id = 1
+WHERE d.user_id IS NULL;
+
+2. Write a SQL query to find all duplicate lastnames in a table named user
+
++----+-----------+-----------
+| id | firstname | lastname |
++----+-----------+-----------
+| 1  | Ivan      | Sidorov  |
+| 2  | Alexandr  | Ivanov   |
+| 3  | Petr      | Petrov   |
+| 4  | Stepan    | Ivanov   |
++----+-----------+----------+
+
+-- Group rows by last name and keep only the names that occur more than once.
+SELECT lastname
+FROM user
+GROUP BY lastname
+HAVING COUNT(lastname)>1;
+
+3. Write a SQL query to get a username from the user table with the second-highest salary from the salary table. Show the username and its salary in the result.
+
++---------+--------+
+| user_id | salary |
++----+--------+----+
+| 1       | 1000   |
+| 2       | 1100   |
+| 3       | 900    |
+| 4       | 1200   |
++---------+--------+
+
++---------+--------+
+| id | username    |
++----+--------+----+
+| 1  | Alex       |
+| 2  | Maria      |
+| 3  | Bob        |
+| 4  | Sean       |
++---------+-------+
+
+-- The subquery picks the second-highest DISTINCT salary, so a tie for the
+-- top salary cannot be reported as "second highest"; every user earning
+-- that salary is returned.
+SELECT u.username, s.salary
+FROM user AS u
+JOIN salary AS s ON s.user_id = u.id
+WHERE s.salary = (
+	SELECT DISTINCT s2.salary
+	FROM salary AS s2
+	ORDER BY s2.salary DESC
+	LIMIT 1 OFFSET 1
+);
+
+===================== LINUX ======================
+
+1. List processes listening on ports 80 and 443
+
+# Filter on the exact local port; `grep :80` would also match :8080, :8000, ...
+# Run with sudo to see the processes owned by other users.
+ss -tulpn '( sport = :80 or sport = :443 )'
+
+2. List process environment variables by given PID
+
+# Entries in /proc/<pid>/environ are NUL-separated; print one variable per line.
+cat /proc/<pid>/environ | tr '\0' '\n'
+
+3. Launch a python program my_program.py through CLI in the background. How would you close it after some period of time?
+
+- launch in the background and save its pid:
+	python my_program.py &
+	pid=$!
+
+- if the pid was not saved, look it up:
+	ps aux | grep my_program.py
+
+- stop the process after the desired time (plain `kill` sends SIGTERM so the program can clean up; use `kill -9` only if it does not exit):
+	kill <pid_of_the_process>
+
+- or limit the run time up front, e.g. to 60 seconds:
+	timeout 60 python my_program.py &
diff --git a/dataeng/processed_data/level 1/README.md b/dataeng/processed_data/level 1/README.md
@@ -0,0 +1 @@
+The main script makes requests to the given folder, processes the data, uses the pandas library to build a DataFrame with the necessary information, and saves that DataFrame to a CSV file named output.csv.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The main script makes requests to the given folder, processes the data, uses the pandas library to build a DataFrame with the necessary information, and saves that DataFrame to a CSV file named output.csv.