From f89de23b826bd66155684ec966601243602bd81b Mon Sep 17 00:00:00 2001 From: Robert Smallshire Date: Sun, 7 Jun 2015 22:00:25 +0200 Subject: [PATCH] Adds persistent cacheing of SegYReader objects. SegYReader objects are now cached by default in a persistent store of pickles. Cached readers are located on the basis of content hashes of the SEG Y files to which they relate combined with the other arguments used to create the orginal reader. This can lead to significant speed-ups since all indexes owned by the reader instance are also cached. --- segpy/reader.py | 196 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 183 insertions(+), 13 deletions(-) diff --git a/segpy/reader.py b/segpy/reader.py index 8946d9c..4007be6 100644 --- a/segpy/reader.py +++ b/segpy/reader.py @@ -1,10 +1,12 @@ -from __future__ import print_function +import os +import pickle +from pathlib import Path + +from segpy import __version__ from segpy.encoding import ASCII from segpy.packer import make_header_packer -from segpy.sorted_set import SortedFrozenSet - from segpy.trace_header import TraceHeaderRev1 -from segpy.util import file_length, filename_from_handle, make_sorted_distinct_sequence +from segpy.util import file_length, filename_from_handle, make_sorted_distinct_sequence, hash_for_file from segpy.datatypes import DATA_SAMPLE_FORMAT_TO_SEG_Y_TYPE, SEG_Y_TYPE_DESCRIPTION, SEG_Y_TYPE_TO_CTYPE, size_in_bytes from segpy.toolkit import (extract_revision, bytes_per_sample, @@ -19,7 +21,7 @@ from segpy.toolkit import (extract_revision, guess_textual_header_encoding) -def create_reader(fh, encoding=None, trace_header_format=TraceHeaderRev1, endian='>', progress=None): +def create_reader(fh, encoding=None, trace_header_format=TraceHeaderRev1, endian='>', progress=None, cache_directory=".segpy"): """Create a SegYReader (or one of its subclasses) based on performing a scan of SEG Y data. @@ -50,6 +52,11 @@ def create_reader(fh, encoding=None, trace_header_format=TraceHeaderRev1, endian provided, this callback will be invoked at least once with an argument equal to one. + cache_directory: The directory for the cache file. Relative paths + are interpreted as being relative to the directory containing + the SEG Y file. Absolute paths are used as is. If + cache_directory is None, caching is disabled. + Raises: ValueError: ``fh`` is unsuitable for some reason, such as not being open, not being seekable, not being in @@ -94,33 +101,128 @@ def create_reader(fh, encoding=None, trace_header_format=TraceHeaderRev1, endian if endian not in ('<', '>'): raise ValueError("Unrecognised endian value {!r}".format(endian)) + reader = None + cache_file_path = None + + if cache_directory is not None: + sha1 = hash_for_file(fh, encoding, trace_header_format, endian) + seg_y_path = filename_from_handle(fh) + cache_file_path = _locate_cache_file(seg_y_path, cache_directory, sha1) + reader = _load_reader_from_cache(cache_file_path, seg_y_path) + + if reader is None: + reader = _make_reader(fh, encoding, trace_header_format, endian, progress) + if cache_directory is not None: + _save_reader_to_cache(reader, cache_file_path) + + return reader + + +def _locate_cache_file(seg_y_path, cache_directory, sha1): + """Determine the location of the cache file. + + Args: + seg_y_path: The path to the SEG Y file. + + cache_directory: The directory for the cache file. Relative paths + are interpreted as being relative to the directory containing + the SEG Y file. Absolute paths are used as is. + + sha1: The SHA1 hash corresponding to the file. + + Returns: + A Path object containing the absolute path of the cache file. + """ + cache_dir_path = Path(cache_directory) + cache_filename = (sha1 + '.p') + if cache_dir_path.is_absolute(): + cache_file_path = cache_dir_path / cache_filename + else: + normalized_seg_y_path = Path(seg_y_path).resolve() + cache_file_path = normalized_seg_y_path.parent / cache_directory / cache_filename + return cache_file_path + + +def _save_reader_to_cache(reader, cache_file_path): + """Save a reader object to a pickle file. + + Args: + reader: The Reader instance to be persisted. + cache_file_path: A Path instance giving the path to the pickle file location. + """ + cache_path = cache_file_path.parent + os.makedirs(str(cache_path), exist_ok=True) + try: + with cache_file_path.open('wb') as cache_file: + try: + pickle.dump(reader, cache_file) + except (pickle.PicklingError, TypeError) as pickling_error: + print("Could not pickle {} because {}".format(reader, pickling_error)) + pass + except OSError as os_error: + print("Could not cache {} because {}".format(reader, os_error)) + + +def _load_reader_from_cache(cache_file_path, seg_y_path): + """Attempt to load a reader object from cache. + + Any cache file could be located but not successfully read is removed. + + Args: + cache_file_path: A Path object referring to the pickle file. + + seg_y_filename: The name of the SEG Y file which the pickled reader is expected + to be able to read (used for error reporting). + + Returns: + A Reader instance associated with seg_y_filename. + + Raises: + TypeError: If the pickle could be read, but did not contain a SegYReader. + """ + + if not (cache_file_path.exists() and cache_file_path.is_file()): + return None + + with cache_file_path.open('rb') as pickle_file: + try: + reader = pickle.load(pickle_file) + except (pickle.UnpicklingError, TypeError, EOFError) as unpickling_error: + reader = None + print("Could not unpickle reader for {} because {}".format(seg_y_path, unpickling_error)) + try: + cache_file_path.unlink() + except OSError as os_error: + print("Could not remove stale cache entry {} for {} because {}" + .format(cache_file_path, seg_y_path, os_error)) + else: + print("Removed stale cache entry {} for {}".format(cache_file_path, seg_y_path)) + if not isinstance(reader, SegYReader): + raise TypeError("Pickle at {} does not contain a {} instance.".format(cache_file_path, SegYReader.__name__)) + return reader + + +def _make_reader(fh, encoding, trace_header_format, endian, progress): if encoding is None: encoding = guess_textual_header_encoding(fh) - if encoding is None: encoding = ASCII - textual_reel_header = read_textual_reel_header(fh, encoding) binary_reel_header = read_binary_reel_header(fh, endian) extended_textual_header = read_extended_textual_headers(fh, binary_reel_header, encoding) revision = extract_revision(binary_reel_header) bps = bytes_per_sample(binary_reel_header, revision) - trace_offset_catalog, trace_length_catalog, cdp_catalog, line_catalog = catalog_traces(fh, bps, trace_header_format, endian, progress) - if cdp_catalog is not None and line_catalog is None: return SegYReader2D(fh, textual_reel_header, binary_reel_header, extended_textual_header, trace_offset_catalog, trace_length_catalog, cdp_catalog, trace_header_format, encoding, endian) - if cdp_catalog is None and line_catalog is not None: return SegYReader3D(fh, textual_reel_header, binary_reel_header, extended_textual_header, trace_offset_catalog, trace_length_catalog, line_catalog, trace_header_format, encoding, endian) - return SegYReader(fh, textual_reel_header, binary_reel_header, extended_textual_header, trace_offset_catalog, trace_length_catalog, trace_header_format, encoding, endian) - class SegYReader(object): """A basic SEG Y reader. @@ -186,6 +288,61 @@ class SegYReader(object): self._bytes_per_sample = bytes_per_sample( self._binary_reel_header, self.revision) + def __getstate__(self): + """Copy the reader's state to a pickleable dictionary. + + Note: + Subclasses which have non-pickleable attributes must + override this method. + """ + if self._fh.closed: + raise TypeError("Cannot pickle {} object where file handle has been closed" + .format(self.__class__.__name__)) + + filename = filename_from_handle(self._fh) + if filename == '': + raise TypeError("Cannot pickle {} object where file handle has filename {!r}" + .format(self.__class__.__name__), filename) + file_pos = self._fh.tell() + file_mode = self._fh.mode + + state = self.__dict__.copy() + state['__version__'] = __version__ + state['_file_name'] = filename + state['_file_pos'] = file_pos + state['_file_mode'] = file_mode + del state['_fh'] + return state + + def __setstate__(self, state): + """Restore the reader's state from an unpickled dictionary. + + Note: Subclasses which have non-pickleable attributes must + override this method. + """ + if state['__version__'] != __version__: + raise TypeError("Cannot unpickle {} version {} into version {}" + .format(self.__class__.__name__, + state['__version__'], + __version__)) + del state['__version__'] + + try: + fh = open(state['_file_name'], state['_file_mode']) + except OSError as e: + raise TypeError("Cannot unpickle {} because file {} could not be opened" + .format(self.__class__.__name__, state['_filename'])) + else: + self._fh = fh + del state['_file_name'] + del state['_file_mode'] + + file_pos = state['_file_pos'] + fh.seek(file_pos) + del state['_file_pos'] + + self.__dict__.update(state) + def trace_indexes(self): """An iterator over zero-based trace_samples indexes. @@ -201,7 +358,7 @@ class SegYReader(object): def max_num_trace_samples(self): """The number of samples in the trace_samples with the most samples.""" - return self._trace_length_catalog.value_max() + return max(self._trace_length_catalog.values()) def num_trace_samples(self, trace_index): """The number of samples in the specified trace_samples.""" @@ -418,6 +575,13 @@ class SegYReader3D(SegYReader): self._inline_numbers = None self._xline_numbers = None + def __getstate__(self): + # As we're pickling, force evaluation of these properties so they'll be cached + _ = self.inline_numbers() + _ = self.xline_numbers() + state = super().__getstate__() + return state + def _dimensionality(self): return 3 @@ -550,6 +714,12 @@ class SegYReader2D(SegYReader): self._cdp_catalog = cdp_catalog self._cdp_numbers = None + def __getstate__(self): + # As we're pickling, force evaluation of these so they'll be cached + _ = self.cdp_numbers() + state = super().__getstate__() + return state + def _dimensionality(self): return 2