diff --git a/README.md b/README.md index 3f707fc..1498d97 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Cookiecutter Data Science - Conda +# Cookiecutter Data Science - using poetry and justfiles _A logical, reasonably standardized, but flexible project structure for doing and sharing data science work._ @@ -8,21 +8,13 @@ _A logical, reasonably standardized, but flexible project structure for doing an ### Requirements to use the cookiecutter template: ----------- - - Python 2.7 or 3.5 + - Python 3.9+ - [Cookiecutter Python package](http://cookiecutter.readthedocs.org/en/latest/installation.html) >= 1.4.0: This can be installed with pip by or conda depending on how you manage your Python packages: ``` bash $ pip install cookiecutter ``` -or - -``` bash -$ conda config --add channels conda-forge -$ conda install cookiecutter -``` - - ### To start a new project, run: ------------ @@ -38,7 +30,7 @@ $ conda install cookiecutter The directory structure of your new project looks like this: ``` -├── Makefile <- Makefile with commands like `make data` or `make train` +├── justfile <- justfile with commands like `just data` or `just train` ├── README.md <- The top-level README for developers using this project. ├── data │ ├── interim <- Intermediate data that has been transformed. @@ -47,11 +39,11 @@ The directory structure of your new project looks like this: │ ├── models <- Trained and serialized models, model predictions, or model summaries │ -├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), +├── nbs <- Jupyter notebooks. Naming convention is a number (for ordering), │ the creator's initials, and a short `-` delimited description, e.g. │ `1.0-jqp-initial-data-exploration`. │ -├── requirements <- The requirements directory for reproducing the analysis environment +├── pypoetry.toml <- The requirements directory for reproducing the analysis environment │ ├── src <- Source code for use in this project. 
│ ├── __init__.py <- Makes src a Python module @@ -79,9 +71,4 @@ We welcome contributions! [See the docs for guidelines](https://drivendata.githu ### Installing development requirements ------------ - pip install -r requirements.txt - -### Running the tests ------------- - - py.test tests + poetry install diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/.gitkeep b/data/.gitkeep similarity index 100% rename from {{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/.gitkeep rename to data/.gitkeep diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 8acbfb2..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,47 +0,0 @@ -import sys -import pytest -import shutil -from pathlib import Path -from cookiecutter import main - -CCDS_ROOT = Path(__file__).parents[1].resolve() - -args = { - 'project_name': 'DrivenData', - 'author_name': 'DrivenData', - 'open_source_license': 'BSD-3-Clause', - 'python_interpreter': 'python' - } - - -def system_check(basename): - platform = sys.platform - if 'linux' in platform: - basename = basename.lower() - return basename - - -@pytest.fixture(scope='class', params=[{}, args]) -def default_baked_project(tmpdir_factory, request): - temp = tmpdir_factory.mktemp('data-project') - out_dir = Path(temp).resolve() - - pytest.param = request.param - main.cookiecutter( - str(CCDS_ROOT), - no_input=True, - extra_context=pytest.param, - output_dir=out_dir - ) - - pn = pytest.param.get('project_name') or 'project_name' - - # project name gets converted to lower case on Linux but not Mac - pn = system_check(pn) - - proj = out_dir / pn - request.cls.path = proj - yield - - # cleanup after - shutil.rmtree(out_dir) \ No newline at end of file diff --git a/tests/test_creation.py b/tests/test_creation.py deleted file mode 100644 index 59bbc75..0000000 --- a/tests/test_creation.py +++ /dev/null @@ -1,119 +0,0 @@ -import os -import 
pytest -from subprocess import check_output -from conftest import system_check - - -def no_curlies(filepath): - """ Utility to make sure no curly braces appear in a file. - That is, was Jinja able to render everything? - """ - with open(filepath, 'r') as f: - data = f.read() - - template_strings = [ - '{{', - '}}', - '{%', - '%}' - ] - - template_strings_in_file = [s in data for s in template_strings] - return not any(template_strings_in_file) - - -@pytest.mark.usefixtures("default_baked_project") -class TestCookieSetup(object): - def test_project_name(self): - project = self.path - if pytest.param.get('project_name'): - name = system_check('DrivenData') - assert project.name == name - else: - assert project.name == 'project_name' - - def test_author(self): - setup_ = self.path / 'setup.py' - args = ['python', str(setup_), '--author'] - p = check_output(args).decode('ascii').strip() - if pytest.param.get('author_name'): - assert p == 'DrivenData' - else: - assert p == 'Your name (or your organization/company/team)' - - def test_readme(self): - readme_path = self.path / 'README.md' - assert readme_path.exists() - assert no_curlies(readme_path) - # if pytest.param.get('project_name'): - # with open(readme_path) as fin: - # assert 'DrivenData' == next(fin).strip() - - def test_setup(self): - setup_ = self.path / 'setup.py' - args = ['python', str(setup_), '--version'] - p = check_output(args).decode('ascii').strip() - assert p == '0.1.0' - - # def test_license(self): - # license_path = self.path / 'LICENSE' - # assert license_path.exists() - # assert no_curlies(license_path) - - # def test_license_type(self): - # setup_ = self.path / 'setup.py' - # args = ['python', str(setup_), '--license'] - # p = check_output(args).decode('ascii').strip() - # if pytest.param.get('open_source_license'): - # assert p == 'BSD-3' - # else: - # assert p == 'MIT' - - def test_requirements(self): - reqs_path = self.path / 'requirements'/ 'environment.yaml' - assert reqs_path.exists() - 
assert no_curlies(reqs_path) - # if pytest.param.get('python_interpreter'): - # with open(reqs_path) as fin: - # lines = list(map(lambda x: x.strip(), fin.readlines())) - # assert 'pathlib2' in lines - - def test_makefile(self): - makefile_path = self.path / 'Makefile' - assert makefile_path.exists() - assert no_curlies(makefile_path) - - def test_folders(self): - expected_dirs = [ - 'data', - # 'data/external', - 'data/interim', - 'data/processed', - 'data/raw', - # 'docs', - 'outputs', - 'notebooks', - # 'references', - # 'reports', - # 'reports/figures', - 'drivendata', - 'drivendata/data', - 'drivendata/features', - 'drivendata/models', - 'drivendata/visualization', - ] - - ignored_dirs = [ - str(self.path) - ] - - abs_expected_dirs = [str(self.path / d) for d in expected_dirs] - abs_dirs, _, _ = list(zip(*os.walk(self.path))) - # if pytest.param.get('project_name'): - # print('proj', pytest.param.get('project_name', 'project_name')) - # print(abs_dirs) - # print(set(abs_expected_dirs + ignored_dirs)) - # 1/0 - # print(set(abs_expected_dirs + ignored_dirs) - set(abs_dirs)) - assert len(set(abs_expected_dirs + ignored_dirs) - set(abs_dirs)) == 0 - diff --git a/{{ cookiecutter.repo_name }}/.gitignore b/{{ cookiecutter.repo_name }}/.gitignore index d7c9832..83ff7b0 100644 --- a/{{ cookiecutter.repo_name }}/.gitignore +++ b/{{ cookiecutter.repo_name }}/.gitignore @@ -1,3 +1,11 @@ + +# exclude data from source control by default +/data/ +/outputs/ + +# DotEnv configuration +.env + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -56,9 +64,6 @@ docs/_build/ # PyBuilder target/ -# DotEnv configuration -.env - # Database *.db *.rdb @@ -75,9 +80,6 @@ target/ # Jupyter NB Checkpoints .ipynb_checkpoints/ -# exclude data from source control by default -/data/ - # Mac OS-specific storage files .DS_Store diff --git a/{{ cookiecutter.repo_name }}/README.md b/{{ cookiecutter.repo_name }}/README.md index b2228c2..d6ff058 100644 --- a/{{ cookiecutter.repo_name 
}}/README.md +++ b/{{ cookiecutter.repo_name }}/README.md @@ -2,49 +2,29 @@ {{cookiecutter.description}} -## Project Organization - ├── Justfile <- Makefile with commands like `make data` or `make train` - ├── README.md <- The top-level README for developers using this project. - ├── data - │   ├── 30_processed <- The final, canonical data sets for modeling. - │   ├── 20_interim <- Intermediate data that has been transformed. - │   └── 10_raw <- The original, immutable data dump. - │ - │ - ├── nbs <- Jupyter notebooks. Namiwith creator's initials, a number (for ordering), and short `-` delimited description, e.g. - │ `jqp-1.0-initial-data-exploration`. - │ - ├── pyproject.toml <- defines project dependencies and build configuration - ├── src <- Source code for use in this project. - │   ├── __init__.py <- Makes src a Python module - │ │ - │   ├── data <- Scripts to download or generate data - │   │   └── make_dataset.py - │ │ - │   ├── features <- Scripts to turn raw data into features for modeling - │   │   └── build_features.py - │ │ - │   ├── models <- Scripts to train models and then use trained models to make - │ │ │ predictions - │   │   ├── predict_model.py - │   │   └── train_model.py - │ │ - │   └── visualization <- Scripts to create exploratory and results oriented visualizations -    └── visualize.py +Project status: TODO + +Project plan: + +- [x] Init +- [ ] Fill out README +- [ ] ??? +- [ ] Profit ## Install requirements -This project uses poetry for requirement and is set up for torch using cuda. -``` +This project uses [poetry](https://python-poetry.org/) for requirement and is set up for torch using cuda. 
+~~~ poetry install -``` +~~~ ## How to get data TODO document how to get the data + ## How to run This project uses [just](https://github.com/casey/just) @@ -53,6 +33,36 @@ This project uses [just](https://github.com/casey/just) just --list ~~~ + +## Project Organization + +Note this project uses + +- [Justfile](https://github.com/casey/just): Command runner with commands like `just data` or `just train` +- data: [data directory](https://cookiecutter-data-science.drivendata.org/#directory-structure) + - ./10_raw <- The original, immutable data dump. + - ./20_interim <- Intermediate data that has been transformed. + - ./30_processed <- The final, canonical data sets for modeling. +- nbs: Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g. `jqp-1.0-initial-data-exploration`. +- pyproject.toml: defines poetry project dependencies and build configuration +- {{cookiecutter.project_name}}: Source code for use in this project. + + +## How to cite + +~~~bibtex +@software{wassname2024{{ cookiecutter.project_name.lower().replace(' ', '_') }}, + author = {Clark, M.J.}, + title = { {{cookiecutter.project_name}} }, + year = {2024}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/wassname/{{ cookiecutter.project_name.lower().replace(' ', '_') }} }, + commit = {} +} +~~~ + + --------

Project based on the cookiecutter data science project template. #cookiecutterdatascience

diff --git a/{{ cookiecutter.repo_name }}/justfile b/{{ cookiecutter.repo_name }}/justfile index b758d30..982e1f3 100644 --- a/{{ cookiecutter.repo_name }}/justfile +++ b/{{ cookiecutter.repo_name }}/justfile @@ -1,4 +1,14 @@ -package := "{{cookiecutter.package_name}}" +# see https://cheatography.com/linux-china/cheat-sheets/justfile/ + +set dotenv-load + +# Export all just variables as environment variables. +set export + +package := "{{cookiecutter.project_name.lower().replace(' ', '_')}}" + +[private] +default: @just --list # put your run commands here app: diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/__init__.py b/{{ cookiecutter.repo_name }}/nbs/.gitkeep similarity index 100% rename from {{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/__init__.py rename to {{ cookiecutter.repo_name }}/nbs/.gitkeep diff --git a/{{ cookiecutter.repo_name }}/nbs/mjc_001_EDA.ipynb b/{{ cookiecutter.repo_name }}/nbs/mjc_001_EDA.ipynb new file mode 100644 index 0000000..e0b258d --- /dev/null +++ b/{{ cookiecutter.repo_name }}/nbs/mjc_001_EDA.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1b44551e", + "metadata": {}, + "source": [ + "# Exploratory Data Analysis\n", + "\n", + "Hypothesis: What is this notebook about?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "198de680", + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-28T02:34:01.879987Z", + "start_time": "2022-06-28T02:34:01.864103Z" + } + }, + "outputs": [], + "source": [ + "# autoreload your package\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "import {{ cookiecutter.project_name.lower().replace(' ', '_') }}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a372ed7c", + "metadata": { + "ExecuteTime": { + "end_time": "2022-06-28T02:34:02.470436Z", + "start_time": "2022-06-28T02:34:02.424826Z" + } + }, + "outputs": [], + "source": [ + "## secrets\n", + "from dotenv import load_dotenv\n", + "load_dotenv() # take environment variables from .env.\n", + "\n", + "import warnings\n", + "# warnings.simplefilter(\"ignore\")\n", + "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")\n", + "warnings.filterwarnings(\"ignore\", \".*divide by zero.*\")\n", + "\n", + "## numeric, plotting\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "plt.style.use('ggplot')\n", + "plt.rcParams['figure.figsize'] = (7.0, 4)\n", + "\n", + "## utils\n", + "from pathlib import Path\n", + "from tqdm.auto import tqdm\n", + "import logging, os, re\n", + "import collections, functools, itertools\n", + "\n", + "# torch\n", + "import pytorch_lightning as pl\n", + "from einops import rearrange, repeat, reduce\n", + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# logging\n", + "from loguru import logger\n", + "logger.remove()\n", + "logger.add(os.sys.stdout, level=\"ERROR\", colorize=True, format=\"{time} | {message}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54a03c3a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64890012", + "metadata": { + "ExecuteTime": { + "end_time": 
"2022-06-28T02:34:02.890216Z", + "start_time": "2022-06-28T02:34:02.882249Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d4da6fa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d102e3d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/{{ cookiecutter.repo_name }}/pyproject.toml b/{{ cookiecutter.repo_name }}/pyproject.toml index 188ddf1..1feba80 100644 --- a/{{ cookiecutter.repo_name }}/pyproject.toml +++ b/{{ cookiecutter.repo_name }}/pyproject.toml @@ -10,19 +10,19 @@ readme = "README.md" in-project=true [tool.poetry.dependencies] -python = ">=3.10,<3.13" -torch = {version = "^2.1.0+cu124", source = "pytorch"} -simple-parsing = "^0.1.4" -tqdm = "^4.66.1" +python = ">=3.10" numpy = "^1.26.1" pandas = "^2.1.1" -lightning = "^2.1.0" matplotlib = "^3.8.0" -loguru = "^0.7.2" -einops = "^0.7.0" scikit-learn = "^1.3.1" -pytorch-optimizer = "^2.12.0" -torchinfo = "^1.8.0" +loguru = "^0.7.2" +tqdm = "^4.66.1" +# einops = "^0.7.0" +# simple-parsing = "^0.1.4" +# torch = {version = 
"^2.1.0+cu124", source = "pytorch"} +# lightning = "^2.1.0" +# pytorch-optimizer = "^2.12.0" +# torchinfo = "^1.8.0" # accelerate = "^0.24.1" # transformers = "4.34.0" # accelerate = "^0.24.1" @@ -31,11 +31,11 @@ torchinfo = "^1.8.0" # einops = "^0.3.1" [[tool.poetry.source]] +# pytorch cuda needs to come from another source https://python-poetry.org/docs/dependency-specification/#source-dependencies name = "pytorch" url = "https://download.pytorch.org/whl/cu124" priority = "explicit" - [tool.poetry.group.dev.dependencies] ipykernel = "^6.25.2" ipywidgets = "^8.1.3" @@ -45,3 +45,7 @@ pylama = "^8.4.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[virtualenvs] +create = true +in-project = true diff --git a/{{ cookiecutter.repo_name }}/research_journal.md b/{{ cookiecutter.repo_name }}/research_journal.md new file mode 100644 index 0000000..cf787de --- /dev/null +++ b/{{ cookiecutter.repo_name }}/research_journal.md @@ -0,0 +1,3 @@ +# 2024-06-09 16:05:45 + +Started project using cookiecutter data science project template. diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/build_features.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/features/build_features.py deleted file mode 100644 index e69de29..0000000 diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/models/predict_model.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/models/predict_model.py deleted file mode 100644 index e69de29..0000000 diff --git a/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/models/train_model.py b/{{ cookiecutter.repo_name }}/{{ cookiecutter.project_name.lower().replace(' ', '_') }}/models/train_model.py deleted file mode 100644 index e69de29..0000000