Skip to content

Commit 434b89f

Browse files
committed
først commit for mvp
0 parents  commit 434b89f

File tree

4 files changed

+95
-0
lines changed

4 files changed

+95
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
data/
2+
.env
3+
poetry.lock

CODEOWNERS

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
* @navikt/personbruker

main.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# %%
2+
import os
3+
import re
4+
5+
import pandas as pd
6+
7+
# %%
8+
def find_substring(substring: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Find the rows of a dataframe where any text cell contains a substring.

    The comparison is case-insensitive: both the search term and the cell
    text are lower-cased before matching. Cells that are not strings
    (numbers, dates, missing values, ...) never match.

    Args:
        substring: Text to look for.
        df: Dataframe to search. It is not modified.

    Returns:
        A dataframe holding every matching row exactly once, in the
        original row order.
    """
    needle = substring.lower()

    # Column-wise Series.map gives the same element-wise test as
    # DataFrame.applymap without relying on that deprecated method.
    cell_hits = df.apply(
        lambda column: column.astype(object).map(
            lambda x: isinstance(x, str) and needle in x.lower()
        )
    )

    # Reduce to one boolean per row. Indexing with the 2-D cell mask would
    # repeat a row once for every matching cell.
    row_mask = cell_hits.any(axis=1).to_numpy(dtype=bool)
    return df.loc[row_mask]
17+
18+
19+
# %%
20+
def find_substring_index(substring: str, df: pd.DataFrame) -> pd.Index:
    """
    Find the index labels of rows where any text cell contains a substring.

    The comparison is case-insensitive: both the search term and the cell
    text are lower-cased before matching. Cells that are not strings never
    match.

    Args:
        substring: Text to look for.
        df: Dataframe to search. It is not modified.

    Returns:
        The index labels of the matching rows, one entry per row, in the
        original row order.
    """
    needle = substring.lower()

    # Column-wise Series.map gives the same element-wise test as
    # DataFrame.applymap without relying on that deprecated method.
    cell_hits = df.apply(
        lambda column: column.astype(object).map(
            lambda x: isinstance(x, str) and needle in x.lower()
        )
    )

    # One boolean per row, so each matching label is reported once.
    row_mask = cell_hits.any(axis=1).to_numpy(dtype=bool)
    return df.index[row_mask]
29+
30+
31+
# %%
32+
def find_substring_regex(regex: str, df: pd.DataFrame, case=False) -> pd.DataFrame:
    """
    Find the rows of a dataframe where any text cell matches a regular expression.

    Only columns with object or string dtype are searched. Values in those
    columns that are not strings (numbers, dates, missing values, ...) are
    treated as non-matching.

    Args:
        regex: Regular expression, searched for anywhere in each cell unless
            the pattern itself is anchored.
        df: Dataframe to search. It is not modified.
        case: If True the match is case-sensitive; defaults to False.

    Returns:
        A dataframe with the rows that have at least one matching cell.
    """
    textlikes = df.select_dtypes(include=[object, "string"])

    def column_hits(column):
        # The .str accessor raises AttributeError on an object column that
        # holds no strings at all, so blank out every non-string value first.
        is_text = column.map(lambda x: isinstance(x, str)).to_numpy(dtype=bool)
        only_text = column.where(is_text)
        return only_text.str.contains(regex, regex=True, case=case, na=False)

    row_mask = textlikes.apply(column_hits).any(axis=1)
    return df[row_mask]
42+
43+
44+
# %%
45+
# Load the survey answers to scan; the path is relative to the working directory.
df = pd.read_excel("data/final/merged.xlsx")
46+
# %%
47+
def finn_personer(df):
    """
    Find rows that may contain personal data and return them as one dataframe.

    A row is flagged when any text cell
    - contains the sign-off "mvh" or "hilsen",
    - contains "@" (possible e-mail address), or
    - consists solely of an 8-digit number starting with 4 or 9, optionally
      prefixed with 0047 or +47, or written as "XXX XX XXX" (presumably
      Norwegian mobile numbers).

    Args:
        df: Dataframe to search. It is not modified.

    Returns:
        A dataframe with every flagged row once. Rows are de-duplicated by
        index label, so two different responses with identical content are
        both kept. This assumes the index labels of `df` are unique, as
        with the default index from `pd.read_excel`.
    """
    # "[49]" instead of "[4|9]": inside a character class "|" is a literal
    # and would also be accepted as the first character. The groups are
    # non-capturing so str.contains does not warn about match groups.
    # NOTE(review): both patterns are anchored with ^...$, so a number
    # embedded in a longer text is not flagged; confirm this is intended.
    phone_compact = r"^(?:0047|\+47)?[49]\d{7}$"
    phone_spaced = r"^[49]\d{2} \d{2} \d{3}$"

    hits = [
        find_substring("mvh", df),
        find_substring("hilsen", df),
        find_substring("@", df),
        find_substring_regex(phone_compact, df, False),
        find_substring_regex(phone_spaced, df, False),
    ]

    combined = pd.concat(hits)
    # The same row can be returned by several searches. Drop repeats by
    # label rather than by content so that identical answers from
    # different respondents still count separately.
    return combined[~combined.index.duplicated(keep="first")]
64+
65+
# %%
66+
# Collect every row flagged as possibly containing personal data.
df2 = finn_personer(df)
67+
len(df2) # number of rows containing a name, phone number or e-mail with a greeting
68+
# %%
69+
(len(df2) / len(df))*100 # share (in percent) of survey answers that contain personal data
70+
# %%

pyproject.toml

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[tool.poetry]
2+
name = "dataprodukt-toppoppgaver-deling"
3+
version = "0.1.0"
4+
description = ""
5+
authors = ["Tobias McVey <[email protected]>"]
6+
license = "MIT"
7+
readme = "README.md"
8+
packages = [{include = "dataprodukt_toppoppgaver_deling"}]
9+
10+
[tool.poetry.dependencies]
11+
python = "^3.10"
12+
13+
[tool.poetry.group.dev.dependencies]
14+
black = "^22.10.0"
15+
ipykernel = "^6.17.1"
16+
pandas = "^1.5.1"
17+
openpyxl = "^3.0.10"
18+
19+
[build-system]
20+
requires = ["poetry-core"]
21+
build-backend = "poetry.core.masonry.api"

0 commit comments

Comments
 (0)