Merge branch 'master' into just_poetry

This commit is contained in:
Michael J Clark
2024-09-02 14:23:35 +08:00
committed by GitHub
14 changed files with 239 additions and 234 deletions
+6 -19
View File
@@ -1,4 +1,4 @@
# Cookiecutter Data Science - Conda
# Cookiecutter Data Science - using poetry and justfiles
_A logical, reasonably standardized, but flexible project structure for doing and sharing data science work._
@@ -8,21 +8,13 @@ _A logical, reasonably standardized, but flexible project structure for doing an
### Requirements to use the cookiecutter template:
-----------
- Python 2.7 or 3.5
- Python 3.9+
- [Cookiecutter Python package](http://cookiecutter.readthedocs.org/en/latest/installation.html) >= 1.4.0: This can be installed with pip or conda, depending on how you manage your Python packages:
``` bash
$ pip install cookiecutter
```
or
``` bash
$ conda config --add channels conda-forge
$ conda install cookiecutter
```
### To start a new project, run:
------------
@@ -38,7 +30,7 @@ $ conda install cookiecutter
The directory structure of your new project looks like this:
```
├── Makefile <- Makefile with commands like `make data` or `make train`
├── justfile <- justfile with commands like `just data` or `just train`
├── README.md <- The top-level README for developers using this project.
├── data
│ ├── interim <- Intermediate data that has been transformed.
@@ -47,11 +39,11 @@ The directory structure of your new project looks like this:
├── models <- Trained and serialized models, model predictions, or model summaries
├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering),
├── nbs <- Jupyter notebooks. Naming convention is a number (for ordering),
│ the creator's initials, and a short `-` delimited description, e.g.
│ `1.0-jqp-initial-data-exploration`.
├── requirements <- The requirements directory for reproducing the analysis environment
├── pyproject.toml <- The poetry project file listing the dependencies for reproducing the analysis environment
├── src <- Source code for use in this project.
│ ├── __init__.py <- Makes src a Python module
@@ -79,9 +71,4 @@ We welcome contributions! [See the docs for guidelines](https://drivendata.githu
### Installing development requirements
------------
pip install -r requirements.txt
### Running the tests
------------
py.test tests
poetry install
-47
View File
@@ -1,47 +0,0 @@
import sys
import pytest
import shutil
from pathlib import Path
from cookiecutter import main
# Template root handed to cookiecutter: the parent of the directory that
# contains this file, as an absolute path.
CCDS_ROOT = Path(__file__).parents[1].resolve()
# Non-default template answers. Used as the second parameter set of the
# `default_baked_project` fixture and passed to cookiecutter as `extra_context`.
args = {
'project_name': 'DrivenData',
'author_name': 'DrivenData',
'open_source_license': 'BSD-3-Clause',
'python_interpreter': 'python'
}
def system_check(basename):
    """Return *basename* as it is expected to appear on the current platform.

    On Linux (``'linux'`` occurs in ``sys.platform``) the name is lower-cased;
    on every other platform it is returned unchanged.
    """
    return basename.lower() if 'linux' in sys.platform else basename
@pytest.fixture(scope='class', params=[{}, args])
def default_baked_project(tmpdir_factory, request):
    """Bake the template into a temporary directory for a test class.

    The fixture is parametrized with the default context (``{}``) and with
    ``args``. The active context is stored on ``pytest.param`` and the path of
    the generated project on ``request.cls.path``. The output directory is
    removed after the ``yield``.
    """
    out_dir = Path(tmpdir_factory.mktemp('data-project')).resolve()

    # NOTE: this rebinds the ``pytest.param`` attribute so the active context
    # can be read back from the ``pytest`` module.
    context = request.param
    pytest.param = context

    main.cookiecutter(
        str(CCDS_ROOT),
        no_input=True,
        extra_context=context,
        output_dir=out_dir,
    )

    # project name gets converted to lower case on Linux but not Mac
    project_dir_name = system_check(context.get('project_name') or 'project_name')
    request.cls.path = out_dir / project_dir_name

    yield

    # cleanup after
    shutil.rmtree(out_dir)
-119
View File
@@ -1,119 +0,0 @@
import os
import pytest
from subprocess import check_output
from conftest import system_check
def no_curlies(filepath):
    """Check that no Jinja delimiters are left in a file.

    That is, was Jinja able to render everything?

    Args:
        filepath: Path of the text file to inspect.

    Returns:
        bool: ``False`` if any of ``{{``, ``}}``, ``{%`` or ``%}`` occurs in
        the file, ``True`` otherwise (including for an empty file).
    """
    # Decode as UTF-8 explicitly: rendered files may contain non-ASCII
    # characters (e.g. tree-drawing glyphs in a README) that the platform
    # default encoding cannot always decode.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = f.read()

    template_strings = ('{{', '}}', '{%', '%}')
    return not any(s in data for s in template_strings)
@pytest.mark.usefixtures("default_baked_project")
class TestCookieSetup(object):
    """Checks run against the project baked by ``default_baked_project``.

    The tests read the generated project directory from ``self.path`` and the
    cookiecutter context in use from ``pytest.param``.
    """

    def _setup_py_query(self, flag):
        """Run the baked ``setup.py`` with *flag* and return its stripped output."""
        command = ['python', str(self.path / 'setup.py'), flag]
        return check_output(command).decode('ascii').strip()

    def test_project_name(self):
        """The project directory carries the expected name."""
        if pytest.param.get('project_name'):
            expected = system_check('DrivenData')
        else:
            expected = 'project_name'
        assert self.path.name == expected

    def test_author(self):
        """``setup.py --author`` reports the configured or the default author."""
        reported = self._setup_py_query('--author')
        if pytest.param.get('author_name'):
            expected = 'DrivenData'
        else:
            expected = 'Your name (or your organization/company/team)'
        assert reported == expected

    def test_readme(self):
        """The README exists and was fully rendered."""
        readme = self.path / 'README.md'
        assert readme.exists()
        assert no_curlies(readme)
        # Disabled check:
        # if pytest.param.get('project_name'):
        #     with open(readme_path) as fin:
        #         assert 'DrivenData' == next(fin).strip()

    def test_setup(self):
        """``setup.py --version`` reports the initial version."""
        assert self._setup_py_query('--version') == '0.1.0'

    # Disabled tests:
    # def test_license(self):
    #     license_path = self.path / 'LICENSE'
    #     assert license_path.exists()
    #     assert no_curlies(license_path)

    # def test_license_type(self):
    #     setup_ = self.path / 'setup.py'
    #     args = ['python', str(setup_), '--license']
    #     p = check_output(args).decode('ascii').strip()
    #     if pytest.param.get('open_source_license'):
    #         assert p == 'BSD-3'
    #     else:
    #         assert p == 'MIT'

    def test_requirements(self):
        """The environment file exists and was fully rendered."""
        environment_file = self.path / 'requirements' / 'environment.yaml'
        assert environment_file.exists()
        assert no_curlies(environment_file)
        # Disabled check:
        # if pytest.param.get('python_interpreter'):
        #     with open(reqs_path) as fin:
        #         lines = list(map(lambda x: x.strip(), fin.readlines()))
        #     assert 'pathlib2' in lines

    def test_makefile(self):
        """The Makefile exists and was fully rendered."""
        makefile = self.path / 'Makefile'
        assert makefile.exists()
        assert no_curlies(makefile)

    def test_folders(self):
        """Every expected directory is present in the baked project."""
        relative_dirs = [
            'data',
            # 'data/external',
            'data/interim',
            'data/processed',
            'data/raw',
            # 'docs',
            'outputs',
            'notebooks',
            # 'references',
            # 'reports',
            # 'reports/figures',
            'drivendata',
            'drivendata/data',
            'drivendata/features',
            'drivendata/models',
            'drivendata/visualization',
        ]

        # The project root itself is always yielded by os.walk, so it is
        # included in the set that must be found.
        required = {str(self.path / rel) for rel in relative_dirs}
        required.add(str(self.path))

        walked_dirs, _dirnames, _filenames = zip(*os.walk(self.path))

        missing = required - set(walked_dirs)
        assert len(missing) == 0
+8 -6
View File
@@ -1,3 +1,11 @@
# exclude data from source control by default
/data/
/outputs/
# DotEnv configuration
.env
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -56,9 +64,6 @@ docs/_build/
# PyBuilder
target/
# DotEnv configuration
.env
# Database
*.db
*.rdb
@@ -75,9 +80,6 @@ target/
# Jupyter NB Checkpoints
.ipynb_checkpoints/
# exclude data from source control by default
/data/
# Mac OS-specific storage files
.DS_Store
+42 -32
View File
@@ -2,49 +2,29 @@
{{cookiecutter.description}}
## Project Organization
├── Justfile <- Justfile with commands like `just data` or `just train`
├── README.md <- The top-level README for developers using this project.
├── data
│   ├── 30_processed <- The final, canonical data sets for modeling.
│   ├── 20_interim <- Intermediate data that has been transformed.
│   └── 10_raw <- The original, immutable data dump.
├── nbs <- Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g.
`jqp-1.0-initial-data-exploration`.
├── pyproject.toml <- defines project dependencies and build configuration
├── src <- Source code for use in this project.
│   ├── __init__.py <- Makes src a Python module
│ │
│   ├── data <- Scripts to download or generate data
│   │   └── make_dataset.py
│ │
│   ├── features <- Scripts to turn raw data into features for modeling
│   │   └── build_features.py
│ │
│   ├── models <- Scripts to train models and then use trained models to make
│ │ │ predictions
│   │   ├── predict_model.py
│   │   └── train_model.py
│ │
│   └── visualization <- Scripts to create exploratory and results oriented visualizations
   └── visualize.py
Project status: TODO
Project plan:
- [x] Init
- [ ] Fill out README
- [ ] ???
- [ ] Profit
## Install requirements
This project uses poetry for requirement and is set up for torch using cuda.
```
This project uses [poetry](https://python-poetry.org/) for dependency management and is set up for torch using CUDA.
~~~
poetry install
```
~~~
## How to get data
TODO document how to get the data
## How to run
This project uses [just](https://github.com/casey/just)
@@ -53,6 +33,36 @@ This project uses [just](https://github.com/casey/just)
just --list
~~~
## Project Organization
Note this project uses
- [Justfile](https://github.com/casey/just): Command runner with commands like `just data` or `just train`
- data: [data directory](https://cookiecutter-data-science.drivendata.org/#directory-structure)
- ./10_raw <- The original, immutable data dump.
- ./20_interim <- Intermediate data that has been transformed.
- ./30_processed <- The final, canonical data sets for modeling.
- nbs: Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g. `jqp-1.0-initial-data-exploration`.
- pyproject.toml: defines poetry project dependencies and build configuration
- {{cookiecutter.project_name}}: Source code for use in this project.
## How to cite
~~~bibtex
@software{wassname2024{{ cookiecutter.project_name.lower().replace(' ', '_') }},
author = {Clark, M.J.},
title = { {{cookiecutter.project_name}} },
year = {2024},
publisher = {GitHub},
journal = {GitHub repository},
url = {https://github.com/wassname/{{ cookiecutter.project_name.lower().replace(' ', '_') }} },
commit = {<commit hash>}
}
~~~
--------
<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
+11 -1
View File
@@ -1,4 +1,14 @@
package := "{{cookiecutter.package_name}}"
# see https://cheatography.com/linux-china/cheat-sheets/justfile/
set dotenv-load
# Export all just variables as environment variables.
set export
package := "{{cookiecutter.project_name.lower().replace(' ', '_')}}"
[private]
default: @just --list
# put your run commands here
app:
@@ -0,0 +1,155 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1b44551e",
"metadata": {},
"source": [
"# Exploratory Data Analysis\n",
"\n",
"Hypothesis: What is this notebook about?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "198de680",
"metadata": {
"ExecuteTime": {
"end_time": "2022-06-28T02:34:01.879987Z",
"start_time": "2022-06-28T02:34:01.864103Z"
}
},
"outputs": [],
"source": [
"# autoreload your package\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"import {{ cookiecutter.project_name.lower().replace(' ', '_') }}\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a372ed7c",
"metadata": {
"ExecuteTime": {
"end_time": "2022-06-28T02:34:02.470436Z",
"start_time": "2022-06-28T02:34:02.424826Z"
}
},
"outputs": [],
"source": [
"## secrets\n",
"from dotenv import load_dotenv\n",
"load_dotenv() # take environment variables from .env.\n",
"\n",
"import warnings\n",
"# warnings.simplefilter(\"ignore\")\n",
"warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")\n",
"warnings.filterwarnings(\"ignore\", \".*divide by zero.*\")\n",
"\n",
"## numeric, plotting\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"plt.style.use('ggplot')\n",
"plt.rcParams['figure.figsize'] = (7.0, 4)\n",
"\n",
"## utils\n",
"from pathlib import Path\n",
"from tqdm.auto import tqdm\n",
"import logging, os, re\n",
"import collections, functools, itertools\n",
"\n",
"# torch\n",
"import pytorch_lightning as pl\n",
"from einops import rearrange, repeat, reduce\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"# logging\n",
"from loguru import logger\n",
"logger.remove()\n",
"logger.add(os.sys.stdout, level=\"ERROR\", colorize=True, format=\"<level>{time} | {message}</level>\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54a03c3a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "64890012",
"metadata": {
"ExecuteTime": {
"end_time": "2022-06-28T02:34:02.890216Z",
"start_time": "2022-06-28T02:34:02.882249Z"
}
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d4da6fa",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d102e3d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+14 -10
View File
@@ -10,19 +10,19 @@ readme = "README.md"
in-project=true
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
torch = {version = "^2.1.0+cu124", source = "pytorch"}
simple-parsing = "^0.1.4"
tqdm = "^4.66.1"
python = ">=3.10"
numpy = "^1.26.1"
pandas = "^2.1.1"
lightning = "^2.1.0"
matplotlib = "^3.8.0"
loguru = "^0.7.2"
einops = "^0.7.0"
scikit-learn = "^1.3.1"
pytorch-optimizer = "^2.12.0"
torchinfo = "^1.8.0"
loguru = "^0.7.2"
tqdm = "^4.66.1"
# einops = "^0.7.0"
# simple-parsing = "^0.1.4"
# torch = {version = "^2.1.0+cu124", source = "pytorch"}
# lightning = "^2.1.0"
# pytorch-optimizer = "^2.12.0"
# torchinfo = "^1.8.0"
# accelerate = "^0.24.1"
# transformers = "4.34.0"
# accelerate = "^0.24.1"
@@ -31,11 +31,11 @@ torchinfo = "^1.8.0"
# einops = "^0.3.1"
[[tool.poetry.source]]
# pytorch cuda needs to come from another source https://python-poetry.org/docs/dependency-specification/#source-dependencies
name = "pytorch"
url = "https://download.pytorch.org/whl/cu124"
priority = "explicit"
[tool.poetry.group.dev.dependencies]
ipykernel = "^6.25.2"
ipywidgets = "^8.1.3"
@@ -45,3 +45,7 @@ pylama = "^8.4.1"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[virtualenvs]
create = true
in-project = true
@@ -0,0 +1,3 @@
# 2024-06-09 16:05:45
Started project using cookiecutter data science project template.