Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
216 changes: 216 additions & 0 deletions dataeng/processed_data/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"path_to_folder = \"../02-src-data/\"\n",
"path_to_img = \"https://raw.githubusercontent.com/provectus/internship/main/dataeng/02-src-data/\"\n",
"src_data = os.listdir(path_to_folder)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"data_csv, data_img, user_ids = set(), set(), set()\n",
"for file in src_data:\n",
" if file.endswith(\".csv\"):\n",
" user_ids.add(file[:-4])\n",
" data_csv.add(file)\n",
" else:\n",
" data_img.add(file)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"output_data = pd.DataFrame(columns=['id'])\n",
"for csv in data_csv:\n",
" output_data.append({'id': csv[:-4]}, ignore_index=True)\n",
" \n",
" csv_file = pd.read_csv(path_to_folder + csv)\n",
"# row = pd.DataFrame(csv_file)\n",
" output_data = output_data.append(csv_file, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>birthts</th>\n",
" <th>last_name</th>\n",
" <th>first_name</th>\n",
" <th>id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.065524e+11</td>\n",
" <td>Kirk</td>\n",
" <td>Robert</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7.857108e+11</td>\n",
" <td>Smith</td>\n",
" <td>Amy</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.400596e+11</td>\n",
" <td>Quick</td>\n",
" <td>William</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9.712080e+11</td>\n",
" <td>Miller</td>\n",
" <td>Chad</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7.858836e+11</td>\n",
" <td>Hardin</td>\n",
" <td>Christopher</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>6.706260e+11</td>\n",
" <td>Garcia</td>\n",
" <td>Rosa</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>9.559152e+11</td>\n",
" <td>Raymond</td>\n",
" <td>Adolph</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>1.018469e+12</td>\n",
" <td>Shoup</td>\n",
" <td>Jeanne</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>1.074805e+12</td>\n",
" <td>Miller</td>\n",
" <td>Dino</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>8.213940e+11</td>\n",
" <td>Folkers</td>\n",
" <td>Ellen</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" birthts last_name first_name id\n",
"0 5.065524e+11 Kirk Robert NaN\n",
"1 7.857108e+11 Smith Amy NaN\n",
"2 3.400596e+11 Quick William NaN\n",
"3 9.712080e+11 Miller Chad NaN\n",
"4 7.858836e+11 Hardin Christopher NaN\n",
".. ... ... ... ...\n",
"95 6.706260e+11 Garcia Rosa NaN\n",
"96 9.559152e+11 Raymond Adolph NaN\n",
"97 1.018469e+12 Shoup Jeanne NaN\n",
"98 1.074805e+12 Miller Dino NaN\n",
"99 8.213940e+11 Folkers Ellen NaN\n",
"\n",
"[100 rows x 4 columns]"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_data"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
108 changes: 108 additions & 0 deletions dataeng/processed_data/answers.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
===================== PYTHON ======================

1. Optimise execution time of this Python code snippet:

def count_connections(list1: list, list2: list) -> int:
    """Return the number of distinct values present in both lists.

    Only one set is built; ``intersection`` iterates the second list
    directly, and no intermediate list is created just to be measured.
    Runs in O(len(list1) + len(list2)) time on average.
    """
    return len(set(list1).intersection(list2))

2. Given a string S, find the length of the longest substring without repeating characters. Analyze your solution and please provide Space and Time complexities.

def longestSubstring(s: str) -> int:
    """Return the length of the longest substring of *s* with no repeated character.

    Sliding window over the string: ``window_start`` is the index of the
    first character of the current repeat-free window, and ``last_seen``
    maps each character to the index of its most recent occurrence.

    Time complexity:  O(n) - every character is visited once and each
                      dictionary operation is O(1) on average.
    Space complexity: O(min(n, k)), where k is the alphabet size - at most
                      one dictionary entry per distinct character.
    """
    last_seen = {}
    window_start = 0
    best = 0
    for index, char in enumerate(s):
        previous = last_seen.get(char, -1)
        # Only a repeat that lies inside the current window shrinks it;
        # an older occurrence is already to the left of the window.
        if previous >= window_start:
            window_start = previous + 1
        last_seen[char] = index
        best = max(best, index - window_start + 1)
    return best

3. Given a sorted array of distinct integers and a target value, return the index if the target is found. If not, return the index where it would be if it were inserted in order.

def foo(t, arr):
    """Return the index of *t* in sorted *arr*, or the index where it would be inserted.

    *arr* must be sorted in ascending order and contain distinct values.
    ``bisect_left`` returns the position of the first element that is not
    less than *t*: the index of *t* itself when it is present, and the
    insertion point otherwise. An empty *arr* yields 0.

    Time complexity: O(log n); space complexity: O(1).
    """
    from bisect import bisect_left

    return bisect_left(arr, t)

===================== SQL =======================

1. Rewrite this SQL without subquery:

SELECT id
FROM users
WHERE id NOT IN (
SELECT user_id
FROM departments
WHERE department_id = 1
);

-- Anti-join: keep only the users that have NO matching row in department 1.
-- The department filter must be in the ON clause; putting it in WHERE would
-- discard the NULL-extended rows that identify the non-members.
-- NOTE: if departments.user_id can be NULL, the original NOT IN returns no
-- rows at all, whereas this anti-join simply ignores those NULL rows.
SELECT u.id
FROM users AS u
LEFT JOIN departments AS d
    ON d.user_id = u.id
    AND d.department_id = 1
WHERE d.user_id IS NULL;

2. Write a SQL query to find all duplicate lastnames in a table named user

+----+-----------+-----------
| id | firstname | lastname |
+----+-----------+-----------
| 1 | Ivan | Sidorov |
| 2 | Alexandr | Ivanov |
| 3 | Petr | Petrov |
| 4 | Stepan | Ivanov |
+----+-----------+----------+

SELECT lastname
FROM user
GROUP BY lastname
HAVING COUNT(lastname)>1;

3. Write a SQL query to get a username from the user table with the second highest salary from salary tables. Show the username and its salary in the result.

+---------+--------+
| user_id | salary |
+----+--------+----+
| 1 | 1000 |
| 2 | 1100 |
| 3 | 900 |
| 4 | 1200 |
+---------+--------+

+---------+--------+
| id | username |
+----+--------+----+
| 1 | Alex |
| 2 | Maria |
| 3 | Bob |
| 4 | Sean |
+---------+-------+

-- The second highest salary is the largest salary strictly below the maximum,
-- so a tie for first place does not get reported as "second".
-- Every user earning that salary is returned.
SELECT u.username, s.salary
FROM user AS u
JOIN salary AS s
    ON s.user_id = u.id
WHERE s.salary = (
    SELECT MAX(s2.salary)
    FROM salary AS s2
    WHERE s2.salary < (SELECT MAX(s3.salary) FROM salary AS s3)
);

===================== LINUX ======================

1. List processes listening on ports 80 and 443

# Filter on the local (source) port so only exact matches for 80 and 443 are
# shown; a plain `grep :80` would also match :8080, :8000 or a peer address.
# Run as root (sudo) to see the owning process of sockets you do not own.
ss -tunlp '( sport = :80 or sport = :443 )'

2. List process environment variables by given PID

# Entries in /proc/<pid>/environ are separated by NUL bytes; translate them
# to newlines so each variable is printed on its own line.
cat /proc/<pid>/environ | tr '\0' '\n'

3. Launch a python program my_program.py through CLI in the background. How would you close it after some period of time?

- get pid:
ps aux | grep my_program.py

- kill the process:
kill -9 <pid_of_the_process>
1 change: 1 addition & 0 deletions dataeng/processed_data/level 1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The main script makes requests to the given folder, processes the data, uses the Pandas library to build a dataframe with the necessary information, and saves that dataframe to a CSV file named output.csv.
Loading