Merge branch 'master' into just_poetry

2026-06-27 16:45:13 +08:00 · 2024-09-02 14:23:35 +08:00
parent cfc1f2fc5c 46c361d012
commit 9995c3613c
14 changed files with 239 additions and 234 deletions
@@ -1,4 +1,4 @@
-# Cookiecutter Data Science - Conda
+# Cookiecutter Data Science - using poetry and justfiles

 _A logical, reasonably standardized, but flexible project structure for doing and sharing data science work._

@@ -8,21 +8,13 @@ _A logical, reasonably standardized, but flexible project structure for doing an

 ### Requirements to use the cookiecutter template:
 -----------
- - Python 2.7 or 3.5
+ - Python 3.9+
 - [Cookiecutter Python package](http://cookiecutter.readthedocs.org/en/latest/installation.html) >= 1.4.0: This can be installed with pip by or conda depending on how you manage your Python packages:

 ``` bash
 $ pip install cookiecutter
 ```

-or
-
-``` bash
-$ conda config --add channels conda-forge
-$ conda install cookiecutter
-```
-
-
 ### To start a new project, run:
 ------------

@@ -38,7 +30,7 @@ $ conda install cookiecutter
 The directory structure of your new project looks like this: 

 ```
-├── Makefile           <- Makefile with commands like `make data` or `make train`
+├── justfile           <- justfile with commands like `just data` or `just train`
 ├── README.md          <- The top-level README for developers using this project.
 ├── data
 │   ├── interim        <- Intermediate data that has been transformed.
@@ -47,11 +39,11 @@ The directory structure of your new project looks like this:
 │
 ├── models             <- Trained and serialized models, model predictions, or model summaries
 │
-├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+├── nbs          <- Jupyter notebooks. Naming convention is a number (for ordering),
 │                         the creator's initials, and a short `-` delimited description, e.g.
 │                         `1.0-jqp-initial-data-exploration`.
 │
-├── requirements   <- The requirements directory for reproducing the analysis environment
+├── pyproject.toml     <- Poetry project file listing the dependencies for reproducing the analysis environment
 │
 ├── src                <- Source code for use in this project.
 │   ├── __init__.py    <- Makes src a Python module
@@ -79,9 +71,4 @@ We welcome contributions! [See the docs for guidelines](https://drivendata.githu
 ### Installing development requirements
 ------------

-    pip install -r requirements.txt
-
-### Running the tests
------------
-
-    py.test tests
+    poetry install
@@ -1,47 +0,0 @@
-import sys
-import pytest
-import shutil
-from pathlib import Path
-from cookiecutter import main
-
-CCDS_ROOT = Path(__file__).parents[1].resolve()
-
-args = {
-        'project_name': 'DrivenData',
-        'author_name': 'DrivenData',
-        'open_source_license': 'BSD-3-Clause',
-        'python_interpreter': 'python'
-        }
-
-
-def system_check(basename):
-    platform = sys.platform
-    if 'linux' in platform:
-        basename = basename.lower()
-    return basename
-
-
-@pytest.fixture(scope='class', params=[{}, args])
-def default_baked_project(tmpdir_factory, request):
-    temp = tmpdir_factory.mktemp('data-project')
-    out_dir = Path(temp).resolve()
-
-    pytest.param = request.param
-    main.cookiecutter(
-        str(CCDS_ROOT),
-        no_input=True,
-        extra_context=pytest.param,
-        output_dir=out_dir
-    )
-
-    pn = pytest.param.get('project_name') or 'project_name'
-    
-    # project name gets converted to lower case on Linux but not Mac
-    pn = system_check(pn)
-
-    proj = out_dir / pn
-    request.cls.path = proj
-    yield 
-
-    # cleanup after
-    shutil.rmtree(out_dir)
@@ -1,119 +0,0 @@
-import os
-import pytest
-from subprocess import check_output
-from conftest import system_check
-
-
-def no_curlies(filepath):
-    """ Utility to make sure no curly braces appear in a file.
-        That is, was Jinja able to render everything?
-    """
-    with open(filepath, 'r') as f:
-        data = f.read()
-
-    template_strings = [
-        '{{',
-        '}}',
-        '{%',
-        '%}'
-    ]
-
-    template_strings_in_file = [s in data for s in template_strings]
-    return not any(template_strings_in_file)
-
-
-@pytest.mark.usefixtures("default_baked_project")
-class TestCookieSetup(object):
-    def test_project_name(self):
-        project = self.path
-        if pytest.param.get('project_name'):
-            name = system_check('DrivenData')
-            assert project.name == name
-        else:
-            assert project.name == 'project_name'
-
-    def test_author(self):
-        setup_ = self.path / 'setup.py'
-        args = ['python', str(setup_), '--author']
-        p = check_output(args).decode('ascii').strip()
-        if pytest.param.get('author_name'):
-            assert p == 'DrivenData'
-        else:
-            assert p == 'Your name (or your organization/company/team)'
-
-    def test_readme(self):
-        readme_path = self.path / 'README.md'
-        assert readme_path.exists()
-        assert no_curlies(readme_path)
-        # if pytest.param.get('project_name'):
-        #     with open(readme_path) as fin:
-        #         assert 'DrivenData' == next(fin).strip()
-
-    def test_setup(self):
-        setup_ = self.path / 'setup.py'
-        args = ['python', str(setup_), '--version']
-        p = check_output(args).decode('ascii').strip()
-        assert p == '0.1.0'
-
-    # def test_license(self):
-    #     license_path = self.path / 'LICENSE'
-    #     assert license_path.exists()
-    #     assert no_curlies(license_path)
-
-    # def test_license_type(self):
-    #     setup_ = self.path / 'setup.py'
-    #     args = ['python', str(setup_), '--license']
-    #     p = check_output(args).decode('ascii').strip()
-    #     if pytest.param.get('open_source_license'):
-    #         assert p == 'BSD-3'
-    #     else:
-    #         assert p == 'MIT'
-
-    def test_requirements(self):
-        reqs_path = self.path / 'requirements'/ 'environment.yaml'
-        assert reqs_path.exists()
-        assert no_curlies(reqs_path)
-        # if pytest.param.get('python_interpreter'):
-        #     with open(reqs_path) as fin:
-        #         lines = list(map(lambda x: x.strip(), fin.readlines()))
-        #     assert 'pathlib2' in lines
-
-    def test_makefile(self):
-        makefile_path = self.path / 'Makefile'
-        assert makefile_path.exists()
-        assert no_curlies(makefile_path)
-
-    def test_folders(self):
-        expected_dirs = [
-            'data',
-            # 'data/external',
-            'data/interim',
-            'data/processed',
-            'data/raw',
-            # 'docs',
-            'outputs',
-            'notebooks',
-            # 'references',
-            # 'reports',
-            # 'reports/figures',
-            'drivendata',
-            'drivendata/data',
-            'drivendata/features',
-            'drivendata/models',
-            'drivendata/visualization',
-        ]
-
-        ignored_dirs = [
-            str(self.path)
-        ]
-
-        abs_expected_dirs = [str(self.path / d) for d in expected_dirs]
-        abs_dirs, _, _ = list(zip(*os.walk(self.path)))
-        # if pytest.param.get('project_name'):
-        # print('proj',  pytest.param.get('project_name', 'project_name'))
-        # print(abs_dirs)
-        # print(set(abs_expected_dirs + ignored_dirs))
-        # 1/0
-        # print(set(abs_expected_dirs + ignored_dirs) - set(abs_dirs))
-        assert len(set(abs_expected_dirs + ignored_dirs) - set(abs_dirs)) == 0
-
@@ -1,3 +1,11 @@
+
+# exclude data from source control by default
+/data/
+/outputs/
+
+# DotEnv configuration
+.env
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -56,9 +64,6 @@ docs/_build/
 # PyBuilder
 target/

-# DotEnv configuration
-.env
-
 # Database
 *.db
 *.rdb
@@ -75,9 +80,6 @@ target/
 # Jupyter NB Checkpoints
 .ipynb_checkpoints/

-# exclude data from source control by default
-/data/
-
 # Mac OS-specific storage files
 .DS_Store

@@ -2,49 +2,29 @@

 {{cookiecutter.description}}

-## Project Organization

-    ├── Justfile           <- Makefile with commands like `make data` or `make train`
-    ├── README.md          <- The top-level README for developers using this project.
-    ├── data
-    │   ├── 30_processed      <- The final, canonical data sets for modeling.
-    │   ├── 20_interim        <- Intermediate data that has been transformed.
-    │   └── 10_raw            <- The original, immutable data dump.
-    │
-    │
-    ├── nbs                   <- Jupyter notebooks. Namiwith creator's initials, a number (for ordering), and short `-` delimited description, e.g.
-    │                         `jqp-1.0-initial-data-exploration`.
-    │
-    ├── pyproject.toml    <- defines project dependencies and build configuration
-    ├── src                <- Source code for use in this project.
-    │   ├── __init__.py    <- Makes src a Python module
-    │   │
-    │   ├── data           <- Scripts to download or generate data
-    │   │   └── make_dataset.py
-    │   │
-    │   ├── features       <- Scripts to turn raw data into features for modeling
-    │   │   └── build_features.py
-    │   │
-    │   ├── models         <- Scripts to train models and then use trained models to make
-    │   │   │                 predictions
-    │   │   ├── predict_model.py
-    │   │   └── train_model.py
-    │   │
-    │   └── visualization  <- Scripts to create exploratory and results oriented visualizations
-           └── visualize.py
+Project status: TODO
+
+Project plan:
+
+- [x] Init
+- [ ] Fill out README
+- [ ] ???
+- [ ] Profit


 ## Install requirements

-This project uses poetry for requirement and is set up for torch using cuda.
-```
+This project uses [poetry](https://python-poetry.org/) to manage its requirements and is set up for torch using cuda.
+~~~
 poetry install
-```
+~~~

 ## How to get data

 TODO document how to get the data

+
 ## How to run

 This project uses [just](https://github.com/casey/just)
@@ -53,6 +33,36 @@ This project uses [just](https://github.com/casey/just)
 just --list
 ~~~

+
+## Project Organization
+
+Note this project uses
+
+- [Justfile](https://github.com/casey/just): Command runner with commands like `just data` or `just train`
+- data: [data directory](https://cookiecutter-data-science.drivendata.org/#directory-structure)
+    - ./10_raw            <- The original, immutable data dump.
+    - ./20_interim        <- Intermediate data that has been transformed.
+    - ./30_processed      <- The final, canonical data sets for modeling.
+- nbs: Jupyter notebooks. Name with creator's initials, a number (for ordering), and short `-` delimited description, e.g. `jqp-1.0-initial-data-exploration`.
+- pyproject.toml:   defines poetry project dependencies and build configuration
+- {{cookiecutter.project_name}}:    Source code for use in this project.
+
+
+## How to cite
+
+~~~bibtex
+@software{wassname2024{{ cookiecutter.project_name.lower().replace(' ', '_') }},
+  author = {Clark, M.J.},
+  title = { {{cookiecutter.project_name}} },
+  year = {2024},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  url = {https://github.com/wassname/{{ cookiecutter.project_name.lower().replace(' ', '_') }} },
+  commit = {<commit hash>}
+}
+~~~
+
+
 --------

 <p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
@@ -1,4 +1,14 @@
-package := "{{cookiecutter.package_name}}"
+# see https://cheatography.com/linux-china/cheat-sheets/justfile/
+
+set dotenv-load
+
+# Export all just variables as environment variables.
+set export
+
+package := "{{cookiecutter.project_name.lower().replace(' ', '_')}}"
+
+[private]
+default: @just --list

 # put your run commands here
 app:
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1b44551e",
+   "metadata": {},
+   "source": [
+    "# Exploratory Data Analysis\n",
+    "\n",
+    "Hypothesis: What is this notebook about?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "198de680",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-06-28T02:34:01.879987Z",
+     "start_time": "2022-06-28T02:34:01.864103Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# autoreload your package\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import {{ cookiecutter.project_name.lower().replace(' ', '_') }}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a372ed7c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-06-28T02:34:02.470436Z",
+     "start_time": "2022-06-28T02:34:02.424826Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "## secrets\n",
+    "from dotenv import load_dotenv\n",
+    "load_dotenv()  # take environment variables from .env.\n",
+    "\n",
+    "import warnings\n",
+    "# warnings.simplefilter(\"ignore\")\n",
+    "warnings.filterwarnings(\"ignore\", \".*does not have many workers.*\")\n",
+    "warnings.filterwarnings(\"ignore\", \".*divide by zero.*\")\n",
+    "\n",
+    "## numeric, plotting\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "plt.style.use('ggplot')\n",
+    "plt.rcParams['figure.figsize'] = (7.0, 4)\n",
+    "\n",
+    "## utils\n",
+    "from pathlib import Path\n",
+    "from tqdm.auto import tqdm\n",
+    "import logging, os, re\n",
+    "import collections, functools, itertools\n",
+    "\n",
+    "# torch\n",
+    "import pytorch_lightning as pl\n",
+    "from einops import rearrange, repeat, reduce\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "# logging\n",
+    "from loguru import logger\n",
+    "logger.remove()\n",
+    "logger.add(os.sys.stdout, level=\"ERROR\", colorize=True, format=\"<level>{time} | {message}</level>\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54a03c3a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64890012",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-06-28T02:34:02.890216Z",
+     "start_time": "2022-06-28T02:34:02.882249Z"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d4da6fa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d102e3d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.4 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -10,19 +10,19 @@ readme = "README.md"
 in-project=true

 [tool.poetry.dependencies]
-python = ">=3.10,<3.13"
-torch = {version = "^2.1.0+cu124", source = "pytorch"}
-simple-parsing = "^0.1.4"
-tqdm = "^4.66.1"
+python = ">=3.10"
 numpy = "^1.26.1"
 pandas = "^2.1.1"
-lightning = "^2.1.0"
 matplotlib = "^3.8.0"
-loguru = "^0.7.2"
-einops = "^0.7.0"
 scikit-learn = "^1.3.1"
-pytorch-optimizer = "^2.12.0"
-torchinfo = "^1.8.0"
+loguru = "^0.7.2"
+tqdm = "^4.66.1"
+# einops = "^0.7.0"
+# simple-parsing = "^0.1.4"
+# torch = {version = "^2.1.0+cu124", source = "pytorch"}
+# lightning = "^2.1.0"
+# pytorch-optimizer = "^2.12.0"
+# torchinfo = "^1.8.0"
 # accelerate = "^0.24.1"
 # transformers = "4.34.0"
 # accelerate = "^0.24.1"
@@ -31,11 +31,11 @@ torchinfo = "^1.8.0"
 # einops = "^0.3.1"

 [[tool.poetry.source]]
+# pytorch cuda needs to come from another source https://python-poetry.org/docs/dependency-specification/#source-dependencies
 name = "pytorch"
 url = "https://download.pytorch.org/whl/cu124"
 priority = "explicit"

-
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.25.2"
 ipywidgets = "^8.1.3"
@@ -45,3 +45,7 @@ pylama = "^8.4.1"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[virtualenvs]
+create = true
+in-project = true
@@ -0,0 +1,3 @@
+# 2024-06-09 16:05:45 
+
+Started project using cookiecutter data science project template.