mirror of
https://github.com/wassname/cookiecutter-data-science.git
synced 2026-06-27 16:45:13 +08:00
Merge branch 'master' into just_poetry
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# Cookiecutter Data Science - Conda
|
||||
# Cookiecutter Data Science - using poetry and justfiles
|
||||
|
||||
_A logical, reasonably standardized, but flexible project structure for doing and sharing data science work._
|
||||
|
||||
@@ -8,21 +8,13 @@ _A logical, reasonably standardized, but flexible project structure for doing an
|
||||
|
||||
### Requirements to use the cookiecutter template:
|
||||
-----------
|
||||
- Python 2.7 or 3.5
|
||||
- Python 3.9+
|
||||
- [Cookiecutter Python package](http://cookiecutter.readthedocs.org/en/latest/installation.html) >= 1.4.0: This can be installed with pip or conda, depending on how you manage your Python packages:
|
||||
|
||||
``` bash
|
||||
$ pip install cookiecutter
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
``` bash
|
||||
$ conda config --add channels conda-forge
|
||||
$ conda install cookiecutter
|
||||
```
|
||||
|
||||
|
||||
### To start a new project, run:
|
||||
------------
|
||||
|
||||
@@ -38,7 +30,7 @@ $ conda install cookiecutter
|
||||
The directory structure of your new project looks like this:
|
||||
|
||||
```
|
||||
├── Makefile <- Makefile with commands like `make data` or `make train`
|
||||
├── justfile <- justfile with commands like `just data` or `just train`
|
||||
├── README.md <- The top-level README for developers using this project.
|
||||
├── data
|
||||
│ ├── interim <- Intermediate data that has been transformed.
|
||||
@@ -47,11 +39,11 @@ The directory structure of your new project looks like this:
|
||||
│
|
||||
├── models <- Trained and serialized models, model predictions, or model summaries
|
||||
│
|
||||
├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering),
|
||||
├── nbs <- Jupyter notebooks. Naming convention is a number (for ordering),
|
||||
│ the creator's initials, and a short `-` delimited description, e.g.
|
||||
│ `1.0-jqp-initial-data-exploration`.
|
||||
│
|
||||
├── requirements <- The requirements directory for reproducing the analysis environment
|
||||
├── pyproject.toml <- The poetry project file declaring the dependencies for reproducing the analysis environment
|
||||
│
|
||||
├── src <- Source code for use in this project.
|
||||
│ ├── __init__.py <- Makes src a Python module
|
||||
@@ -79,9 +71,4 @@ We welcome contributions! [See the docs for guidelines](https://drivendata.githu
|
||||
### Installing development requirements
|
||||
------------
|
||||
|
||||
pip install -r requirements.txt
|
||||
|
||||
### Running the tests
|
||||
------------
|
||||
|
||||
py.test tests
|
||||
poetry install
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
import sys
|
||||
import pytest
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from cookiecutter import main
|
||||
|
||||
# Template root handed to cookiecutter: two directory levels above this file.
CCDS_ROOT = Path(__file__).parents[1].resolve()

# Explicit (non-default) cookiecutter context values used to bake a project.
args = {
    'project_name': 'DrivenData',
    'author_name': 'DrivenData',
    'open_source_license': 'BSD-3-Clause',
    'python_interpreter': 'python',
}
|
||||
|
||||
|
||||
def system_check(basename):
    """Return ``basename`` adjusted for the current platform.

    When ``sys.platform`` contains ``'linux'`` the name is lower-cased;
    on every other platform it is returned unchanged.
    """
    if 'linux' in sys.platform:
        return basename.lower()
    return basename
|
||||
|
||||
|
||||
@pytest.fixture(scope='class', params=[{}, args])
def default_baked_project(tmpdir_factory, request):
    """Bake the template into a temporary directory for one test class.

    Runs once with an empty extra context and once with ``args``. The path of
    the generated project is stored on the requesting class as ``path``, and
    the temporary output directory is removed after the class has run.
    """
    temp = tmpdir_factory.mktemp('data-project')
    out_dir = Path(temp).resolve()

    # NOTE(review): this overwrites pytest's public ``pytest.param`` helper
    # with the current context dict. It is kept because the tests read the
    # active context back through ``pytest.param.get(...)``.
    pytest.param = request.param
    main.cookiecutter(
        str(CCDS_ROOT),
        no_input=True,
        extra_context=pytest.param,
        output_dir=out_dir
    )

    pn = pytest.param.get('project_name') or 'project_name'

    # project name gets converted to lower case on Linux but not Mac
    pn = system_check(pn)

    proj = out_dir / pn
    request.cls.path = proj
    yield

    # cleanup after
    shutil.rmtree(out_dir)
|
||||
@@ -1,119 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from subprocess import check_output
|
||||
from conftest import system_check
|
||||
|
||||
|
||||
def no_curlies(filepath):
    """Utility to make sure no Jinja delimiters appear in a file.

    That is, was Jinja able to render everything?

    Returns True when none of ``{{``, ``}}``, ``{%`` or ``%}`` occurs in the
    file's text, and False as soon as any of them is present.
    """
    # Decode explicitly as UTF-8: rendered files may contain non-ASCII text,
    # which fails to decode under a non-UTF-8 locale default.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = f.read()

    template_strings = ('{{', '}}', '{%', '%}')
    return not any(s in data for s in template_strings)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("default_baked_project")
class TestCookieSetup(object):
    """Checks run against a project baked by ``default_baked_project``.

    The tests read the generated project location from ``self.path`` and the
    active cookiecutter context from ``pytest.param``.
    """

    def test_project_name(self):
        """The project directory is named after the configured project name."""
        project = self.path
        if pytest.param.get('project_name'):
            name = system_check('DrivenData')
            assert project.name == name
        else:
            assert project.name == 'project_name'

    def test_author(self):
        """``setup.py --author`` reports the configured or default author."""
        setup_ = self.path / 'setup.py'
        args = ['python', str(setup_), '--author']
        p = check_output(args).decode('ascii').strip()
        if pytest.param.get('author_name'):
            assert p == 'DrivenData'
        else:
            assert p == 'Your name (or your organization/company/team)'

    def test_readme(self):
        """README.md exists and contains no unrendered Jinja delimiters."""
        readme_path = self.path / 'README.md'
        assert readme_path.exists()
        assert no_curlies(readme_path)
        # if pytest.param.get('project_name'):
        #     with open(readme_path) as fin:
        #         assert 'DrivenData' == next(fin).strip()

    def test_setup(self):
        """``setup.py --version`` reports the initial version."""
        setup_ = self.path / 'setup.py'
        args = ['python', str(setup_), '--version']
        p = check_output(args).decode('ascii').strip()
        assert p == '0.1.0'

    # def test_license(self):
    #     license_path = self.path / 'LICENSE'
    #     assert license_path.exists()
    #     assert no_curlies(license_path)

    # def test_license_type(self):
    #     setup_ = self.path / 'setup.py'
    #     args = ['python', str(setup_), '--license']
    #     p = check_output(args).decode('ascii').strip()
    #     if pytest.param.get('open_source_license'):
    #         assert p == 'BSD-3'
    #     else:
    #         assert p == 'MIT'

    def test_requirements(self):
        """The environment file exists and was fully rendered."""
        reqs_path = self.path / 'requirements' / 'environment.yaml'
        assert reqs_path.exists()
        assert no_curlies(reqs_path)
        # if pytest.param.get('python_interpreter'):
        #     with open(reqs_path) as fin:
        #         lines = list(map(lambda x: x.strip(), fin.readlines()))
        #     assert 'pathlib2' in lines

    def test_makefile(self):
        """The Makefile exists and was fully rendered."""
        makefile_path = self.path / 'Makefile'
        assert makefile_path.exists()
        assert no_curlies(makefile_path)

    def test_folders(self):
        """Every expected directory is present somewhere in the baked project."""
        expected_dirs = [
            'data',
            # 'data/external',
            'data/interim',
            'data/processed',
            'data/raw',
            # 'docs',
            'outputs',
            'notebooks',
            # 'references',
            # 'reports',
            # 'reports/figures',
            'drivendata',
            'drivendata/data',
            'drivendata/features',
            'drivendata/models',
            'drivendata/visualization',
        ]

        ignored_dirs = [
            str(self.path)
        ]

        abs_expected_dirs = [str(self.path / d) for d in expected_dirs]
        # Collect only the directory paths; a comprehension (unlike unpacking
        # zip(*os.walk(...))) also copes with a walk that yields nothing.
        abs_dirs = [root for root, _, _ in os.walk(self.path)]

        missing = set(abs_expected_dirs + ignored_dirs) - set(abs_dirs)
        assert len(missing) == 0, 'missing directories: {}'.format(sorted(missing))
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
|
||||
# exclude data from source control by default
|
||||
/data/
|
||||
/outputs/
|
||||
|
||||
# DotEnv configuration
|
||||
.env
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
@@ -56,9 +64,6 @@ docs/_build/
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# DotEnv configuration
|
||||
.env
|
||||
|
||||
# Database
|
||||
*.db
|
||||
*.rdb
|
||||
@@ -75,9 +80,6 @@ target/
|
||||
# Jupyter NB Checkpoints
|
||||
.ipynb_checkpoints/
|
||||
|
||||
# exclude data from source control by default
|
||||
/data/
|
||||
|
||||
# Mac OS-specific storage files
|
||||
.DS_Store
|
||||
|
||||
|
||||
@@ -2,49 +2,29 @@
|
||||
|
||||
{{cookiecutter.description}}
|
||||
|
||||
## Project Organization
|
||||
|
||||
├── Justfile <- Justfile with commands like `just data` or `just train`
|
||||
├── README.md <- The top-level README for developers using this project.
|
||||
├── data
|
||||
│ ├── 30_processed <- The final, canonical data sets for modeling.
|
||||
│ ├── 20_interim <- Intermediate data that has been transformed.
|
||||
│ └── 10_raw <- The original, immutable data dump.
|
||||
│
|
||||
│
|
||||
├── nbs <- Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g.
|
||||
│ `jqp-1.0-initial-data-exploration`.
|
||||
│
|
||||
├── pyproject.toml <- defines project dependencies and build configuration
|
||||
├── src <- Source code for use in this project.
|
||||
│ ├── __init__.py <- Makes src a Python module
|
||||
│ │
|
||||
│ ├── data <- Scripts to download or generate data
|
||||
│ │ └── make_dataset.py
|
||||
│ │
|
||||
│ ├── features <- Scripts to turn raw data into features for modeling
|
||||
│ │ └── build_features.py
|
||||
│ │
|
||||
│ ├── models <- Scripts to train models and then use trained models to make
|
||||
│ │ │ predictions
|
||||
│ │ ├── predict_model.py
|
||||
│ │ └── train_model.py
|
||||
│ │
|
||||
│ └── visualization <- Scripts to create exploratory and results oriented visualizations
|
||||
└── visualize.py
|
||||
Project status: TODO
|
||||
|
||||
Project plan:
|
||||
|
||||
- [x] Init
|
||||
- [ ] Fill out README
|
||||
- [ ] ???
|
||||
- [ ] Profit
|
||||
|
||||
|
||||
## Install requirements
|
||||
|
||||
This project uses poetry for requirement and is set up for torch using cuda.
|
||||
```
|
||||
This project uses [poetry](https://python-poetry.org/) for requirements management and is set up for torch using CUDA.
|
||||
~~~
|
||||
poetry install
|
||||
```
|
||||
~~~
|
||||
|
||||
## How to get data
|
||||
|
||||
TODO document how to get the data
|
||||
|
||||
|
||||
## How to run
|
||||
|
||||
This project uses [just](https://github.com/casey/just)
|
||||
@@ -53,6 +33,36 @@ This project uses [just](https://github.com/casey/just)
|
||||
just --list
|
||||
~~~
|
||||
|
||||
|
||||
## Project Organization
|
||||
|
||||
Note that this project uses:
|
||||
|
||||
- [Justfile](https://github.com/casey/just): Command runner with commands like `just data` or `just train`
|
||||
- data: [data directory](https://cookiecutter-data-science.drivendata.org/#directory-structure)
|
||||
- ./10_raw <- The original, immutable data dump.
|
||||
- ./20_interim <- Intermediate data that has been transformed.
|
||||
- ./30_processed <- The final, canonical data sets for modeling.
|
||||
- nbs: Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g. `jqp-1.0-initial-data-exploration`.
|
||||
- pyproject.toml: defines poetry project dependencies and build configuration
|
||||
- {{cookiecutter.project_name}}: Source code for use in this project.
|
||||
|
||||
|
||||
## How to cite
|
||||
|
||||
~~~bibtex
|
||||
@software{wassname2024{{ cookiecutter.project_name.lower().replace(' ', '_') }},
|
||||
author = {Clark, M.J.},
|
||||
title = { {{cookiecutter.project_name}} },
|
||||
year = {2024},
|
||||
publisher = {GitHub},
|
||||
journal = {GitHub repository},
|
||||
url = {https://github.com/wassname/{{ cookiecutter.project_name.lower().replace(' ', '_') }} },
|
||||
commit = {<commit hash>}
|
||||
}
|
||||
~~~
|
||||
|
||||
|
||||
--------
|
||||
|
||||
<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
|
||||
|
||||
@@ -1,4 +1,14 @@
|
||||
package := "{{cookiecutter.package_name}}"
|
||||
# see https://cheatography.com/linux-china/cheat-sheets/justfile/
|
||||
|
||||
set dotenv-load
|
||||
|
||||
# Export all just variables as environment variables.
|
||||
set export
|
||||
|
||||
package := "{{cookiecutter.project_name.lower().replace(' ', '_')}}"
|
||||
|
||||
[private]
|
||||
default: @just --list
|
||||
|
||||
# put your run commands here
|
||||
app:
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1b44551e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Exploratory Data Analysis\n",
|
||||
"\n",
|
||||
"Hypothesis: What is this notebook about?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "198de680",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2022-06-28T02:34:01.879987Z",
|
||||
"start_time": "2022-06-28T02:34:01.864103Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# autoreload your package\n",
|
||||
"%load_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"import {{ cookiecutter.project_name.lower().replace(' ', '_') }}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a372ed7c",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2022-06-28T02:34:02.470436Z",
|
||||
"start_time": "2022-06-28T02:34:02.424826Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## secrets\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"load_dotenv() # take environment variables from .env.\n",
|
||||
"\n",
|
||||
"import warnings\n",
|
||||
"# warnings.simplefilter(\"ignore\")\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")\n",
|
||||
"warnings.filterwarnings(\"ignore\", \".*divide by zero.*\")\n",
|
||||
"\n",
|
||||
"## numeric, plotting\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"%matplotlib inline\n",
|
||||
"plt.style.use('ggplot')\n",
|
||||
"plt.rcParams['figure.figsize'] = (7.0, 4)\n",
|
||||
"\n",
|
||||
"## utils\n",
|
||||
"from pathlib import Path\n",
|
||||
"from tqdm.auto import tqdm\n",
|
||||
"import logging, os, re\n",
|
||||
"import collections, functools, itertools\n",
|
||||
"\n",
|
||||
"# torch\n",
|
||||
"import pytorch_lightning as pl\n",
|
||||
"from einops import rearrange, repeat, reduce\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"\n",
|
||||
"# logging\n",
|
||||
"from loguru import logger\n",
|
||||
"logger.remove()\n",
|
||||
"logger.add(os.sys.stdout, level=\"ERROR\", colorize=True, format=\"<level>{time} | {message}</level>\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "54a03c3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "64890012",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2022-06-28T02:34:02.890216Z",
|
||||
"start_time": "2022-06-28T02:34:02.882249Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1d4da6fa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6d102e3d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.10.4 64-bit",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -10,19 +10,19 @@ readme = "README.md"
|
||||
in-project=true
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.10,<3.13"
|
||||
torch = {version = "^2.1.0+cu124", source = "pytorch"}
|
||||
simple-parsing = "^0.1.4"
|
||||
tqdm = "^4.66.1"
|
||||
python = ">=3.10"
|
||||
numpy = "^1.26.1"
|
||||
pandas = "^2.1.1"
|
||||
lightning = "^2.1.0"
|
||||
matplotlib = "^3.8.0"
|
||||
loguru = "^0.7.2"
|
||||
einops = "^0.7.0"
|
||||
scikit-learn = "^1.3.1"
|
||||
pytorch-optimizer = "^2.12.0"
|
||||
torchinfo = "^1.8.0"
|
||||
loguru = "^0.7.2"
|
||||
tqdm = "^4.66.1"
|
||||
# einops = "^0.7.0"
|
||||
# simple-parsing = "^0.1.4"
|
||||
# torch = {version = "^2.1.0+cu124", source = "pytorch"}
|
||||
# lightning = "^2.1.0"
|
||||
# pytorch-optimizer = "^2.12.0"
|
||||
# torchinfo = "^1.8.0"
|
||||
# accelerate = "^0.24.1"
|
||||
# transformers = "4.34.0"
|
||||
# accelerate = "^0.24.1"
|
||||
@@ -31,11 +31,11 @@ torchinfo = "^1.8.0"
|
||||
# einops = "^0.3.1"
|
||||
|
||||
[[tool.poetry.source]]
|
||||
# pytorch cuda needs to come from another source https://python-poetry.org/docs/dependency-specification/#source-dependencies
|
||||
name = "pytorch"
|
||||
url = "https://download.pytorch.org/whl/cu124"
|
||||
priority = "explicit"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipykernel = "^6.25.2"
|
||||
ipywidgets = "^8.1.3"
|
||||
@@ -45,3 +45,7 @@ pylama = "^8.4.1"
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[virtualenvs]
|
||||
create = true
|
||||
in-project = true
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
# 2024-06-09 16:05:45
|
||||
|
||||
Started project using cookiecutter data science project template.
|
||||
Reference in New Issue
Block a user