diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..c93d267 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,38 @@ +{ + "name": "Transformation - Python", + "image": "mcr.microsoft.com/devcontainers/base:1-debian", + "features": { + "ghcr.io/devcontainers/features/java:1": { + "version": "17", + "jdkDistro": "open", + "gradleVersion": "latest", + "mavenVersion": "latest", + "antVersion": "latest", + "groovyVersion": "latest" + }, + "ghcr.io/devcontainers/features/python:1": { + "version": "3.13" + }, + "ghcr.io/devcontainers-extra/features/poetry:2": { + "version": "latest" + } + }, + "containerEnv": { + "PYTHONUNBUFFERED": "1" + }, + "postCreateCommand": "poetry install", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.vscode-pylance", + "charliermarsh.ruff", + "ms-toolsai.jupyter" + ], + "settings": { + "python.testing.pytestEnabled": true, + "python.testing.pytestArgs": ["tests"] + } + } + } +} diff --git a/README.md b/README.md index cef09eb..a82f5d7 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ These jobs are using _PySpark_ to process larger volumes of data and are suppose ### Local Setup +> 💡 Use the [Devcontainer setup](#devcontainer-setup) if you encounter issues. + #### Pre-requisites Please make sure you have the following installed and can run them @@ -32,12 +34,31 @@ We recommend using WSL 2 on Windows for this exercise, due to the [lack of suppo Follow instructions on the [Windows official page](https://learn.microsoft.com/en-us/windows/wsl/setup/environment) and then the linux install. +> 💡 Use the [Devcontainer setup](#devcontainer-setup) if you encounter issues. + #### Install all dependencies ```bash poetry install ``` +### Devcontainer setup + +Configuration for dev containers is provided in `.devcontainer`. + +> ⚠️ This can take up to 7 minutes to set up; make sure everything is running before the interview. 
+ +#### In GitHub Codespaces + +1. [Fork](https://github.com/techops-recsys-lateral-hiring/dataengineer-transformations-python/fork) this repository. +2. Follow the [codespace instructions](https://docs.github.com/en/codespaces/developing-in-a-codespace/creating-a-codespace-for-a-repository#the-codespace-creation-process) from the forked repository to create the environment. + +#### In VSCode - Alternative + +This requires a working local Docker setup matching your OS and licensing situation, and [VSCode](https://code.visualstudio.com/download). + +If you have all of these, follow the instructions at https://code.visualstudio.com/docs/devcontainers/containers. Otherwise, consider using GitHub Codespaces. + ### Verify setup > All of the following commands should be running successfully @@ -87,6 +108,7 @@ The following section provides context over them. ``` / +├─ /.devcontainer # Contains configurations for dev containers ├─ /data_transformations # Contains the main python library │ # with the code to the transformations │ @@ -102,7 +124,6 @@ The following section provides context over them. 
│ # and the setup │ ├─ .gitignore -├─ .pylintrc # configuration for pylint ├─ LICENCE ├─ poetry.lock ├─ pyproject.toml diff --git a/tests/integration/test_validate_spark_environment.py b/tests/integration/test_validate_spark_environment.py index 62e02b2..5828d26 100644 --- a/tests/integration/test_validate_spark_environment.py +++ b/tests/integration/test_validate_spark_environment.py @@ -34,13 +34,15 @@ def __extract_version_line(java_version_output: str) -> str: (line for line in java_version_output.splitlines() if "version" in line), None ) if not version_line: - pytest.fail("Couldn't find version information in `java -version` output.") + pytest.fail( + "Couldn't find version information in `java -version` output.") return version_line # pylint: disable=R1710 def __parse_major_version(version_line: str) -> int: - version_regex = re.compile(r'version "(?P<major>\d+)\.(?P<minor>\d+)\.\w+"') + version_regex = re.compile( + r'version "(?P<major>\d+)\.(?P<minor>\d+)\.\w+"') match = version_regex.search(version_line) if match is not None: major_version = int(match.group("major")) @@ -49,4 +51,11 @@ def __parse_major_version(version_line: str) -> int: # https://softwareengineering.stackexchange.com/questions/175075/why-is-java-version-1-x-referred-to-as-java-x major_version = int(match.group("minor")) return major_version + + # Opensource versions follow an alternative system + alternative_version_regex = re.compile(r'version "(?P<major>\d+)"') + match = alternative_version_regex.search(version_line) + if match is not None: + major_version = int(match.group("major")) + return major_version pytest.fail(f"Couldn't parse Java version from {version_line}.")