From ee228e2dc9c94131ec06915019b7c9ce052d9c9a Mon Sep 17 00:00:00 2001 From: Robert Smallshire Date: Tue, 21 Oct 2014 14:54:49 +0200 Subject: [PATCH] Improved portability between Python 2 and Python 3. Beginnings of code for writing SEG Y. --- catalog.py | 8 ++-- datatypes.py | 1 + ibm_float.py | 82 ++++++++++++++++++++++++++++++--- portability.py | 32 ++++++++++++- reader.py | 9 ++-- test/__init__.py | 1 + test/test_float.py | 50 +++++++++++++++++++++ toolkit.py | 110 +++++++++++++++++++++++++++++++++++++++++++-- util.py | 4 +- 9 files changed, 274 insertions(+), 23 deletions(-) create mode 100644 test/__init__.py create mode 100644 test/test_float.py diff --git a/catalog.py b/catalog.py index f9d2877..a97eeeb 100644 --- a/catalog.py +++ b/catalog.py @@ -1,6 +1,6 @@ from collections import Mapping, Sequence, OrderedDict from fractions import Fraction -import repr +from portability import reprlib from util import contains_duplicates, measure_stride, minmax @@ -261,7 +261,7 @@ class DictionaryCatalog(Mapping): def __repr__(self): return '{}({})'.format( - self.__class__.__name__, repr.repr(self._items.items())) + self.__class__.__name__, reprlib.repr(self._items.items())) class RegularConstantCatalog(Mapping): @@ -357,7 +357,7 @@ class ConstantCatalog(Mapping): def __repr__(self): return '{}({}, {})'.format( self.__class__.__name__, - repr.repr(self._items), + reprlib.repr(self._items), self._value) @@ -424,7 +424,7 @@ class RegularCatalog(Mapping): self._key_min, self._key_max, self._key_stride, - repr.repr(self._values)) + reprlib.repr(self._values)) class LinearRegularCatalog(Mapping): diff --git a/datatypes.py b/datatypes.py index ad94441..11c1f2e 100644 --- a/datatypes.py +++ b/datatypes.py @@ -34,6 +34,7 @@ CTYPES = {'l': 'l', 'ibm': 'ibm'} + # TODO This is redundant with data in the SH_def below CTYPE_DESCRIPTION = {'ibm': 'IBM float', 'l': '32 bit signed integer', diff --git a/ibm_float.py b/ibm_float.py index 828c1cd..2808c13 100644 --- a/ibm_float.py +++ b/ibm_float.py @@ -1,6 +1,13 @@ -import sys +from __future__ import print_function -_P24 = float(pow(2, 24)) +import sys +from math import frexp, isnan, isinf +from portability import long_int, byte_string + + +_IBM_FLOAT32_BITS_PRECISION = 24 +_L24 = long_int(2) ** _IBM_FLOAT32_BITS_PRECISION +_F24 = float(pow(2, _IBM_FLOAT32_BITS_PRECISION)) if sys.version_info >= (3, 0): @@ -18,11 +25,12 @@ if sys.version_info >= (3, 0): if a == b == c == c == 0: return 0.0 - sign = 1 if (a & 0x80) else -1 + sign = -1 if (a & 0x80) else 1 exponent = a & 0x7f - mantissa = ((b << 16) | (c << 8) | d) / _P24 + mantissa = ((b << 16) | (c << 8) | d) / _F24 value = sign * mantissa * pow(16, exponent - 64) return value + else: def ibm2ieee(big_endian_bytes): @@ -42,8 +50,68 @@ else: if a == b == c == c == 0: return 0.0 - sign = 1 if (a & 0x80) else -1 + sign = -1 if (a & 0x80) else 1 exponent = a & 0x7f - mantissa = ((b << 16) | (c << 8) | d) / _P24 + mantissa = ((b << 16) | (c << 8) | d) / _F24 value = sign * mantissa * pow(16, exponent - 64) - return value \ No newline at end of file + return value + + +def ieee2ibm(f): + """Covert a float to four big-endian bytes representing an IBM float. + + Args: + f (float): The value to be converted. + + Returns: + A bytes object (Python 3) or a string (Python 2) containing four + bytes representing a big-endian IBM float. + """ + if f == 0: + # There are many potential representations of zero - this is the standard one + return b'\x00\x00\x00\x00' + + if isnan(f): + raise ValueError("NaN cannot be represented in IBM floating point") + + if isinf(f): + raise ValueError("Infinities cannot be represented in IBM floating point") + + # Now compute m and e to satisfy: + # + # f = m * 2^e + # + # where 0.5 <= abs(m) < 1 + # except when f == 0 in which case m == 0 and e == 0, which we've already + # dealt with. + m, e = frexp(f) + + # Convert the fraction (m) into an integer representation. IEEE float32 + # numbers have 23 explicit (24 implicit) bits of precision. + + mantissa = abs(long_int(m * _L24)) + exponent = e + sign = 0x80 if f < 0 else 0x00 + + # IBM single precision floats are of the form + # (-1)^sign * 0.significand * 16^(exponent-64) + + # Adjust the exponent, and the mantissa in sympathy so it is + # a multiple of four, so it can be expressed in base 16 + remainder = exponent % 4 + if remainder != 0: + shift = 4 - remainder + mantissa >>= shift + exponent += shift + + exponent_16 = exponent >> 2 # Divide by four to convert to base 16 + exponent_16_biased = exponent_16 + 64 # Add the exponent bias of 64 + + # TODO: I'm not entirely sure we're producing properly normalised representations. + + a = sign | exponent_16_biased + b = (mantissa >> 16) & 0xff + c = (mantissa >> 8) & 0xff + d = mantissa & 0xff + + return byte_string((a, b, c, d)) diff --git a/portability.py b/portability.py index 0e8a758..ba975e2 100644 --- a/portability.py +++ b/portability.py @@ -1,4 +1,7 @@ import os +import sys + +EMPTY_BYTE_STRING = b'' if sys.version_info >= (3, 0) else '' def seekable(fh): @@ -21,4 +24,31 @@ def seekable(fh): fh.seek(pos) except AttributeError: return False - return True \ No newline at end of file + return True + + +if sys.version_info >= (3, 0): + long_int = int +else: + long_int = long + + +if sys.version_info >= (3, 0): + def byte_string(integers): + return bytes(integers) +else: + def byte_string(integers): + return ''.join(chr(i) for i in integers) + + +if sys.version_info >= (3, 0): + import reprlib + reprlib = reprlib # Keep the static analyzer happy +else: + import repr as reprlib + +if sys.version_info >= (3, 0): + izip = zip +else: + from itertools import izip + izip = izip # Keep the static analyzer happy diff --git a/reader.py b/reader.py index 17e23c4..e061526 100644 --- a/reader.py +++ b/reader.py @@ -6,10 +6,10 @@ from datatypes import DATA_SAMPLE_FORMAT, CTYPE_DESCRIPTION, CTYPES, size_in_byt from toolkit import (extract_revision, bytes_per_sample, read_reel_header, + read_trace_header, catalog_traces, read_binary_values, compile_trace_header_format, - TraceHeader, REEL_HEADER_NUM_BYTES, TRACE_HEADER_NUM_BYTES) @@ -60,7 +60,7 @@ def create_reader(fh, endian='>', progress=None): print(reader.num_traces()) """ - if fh.encoding is not None: + if hasattr(fh, 'encoding') and fh.encoding is not None: raise TypeError( "SegYReader must be provided with a binary mode file object") @@ -217,10 +217,7 @@ class SegYReader(object): if not (0 <= trace_index < self.num_traces()): raise ValueError("Trace index {} out of range".format(trace_index)) pos = self._trace_offset_catalog[trace_index] - self._fh.seek(pos) - data = self._fh.read(TRACE_HEADER_NUM_BYTES) - trace_header = TraceHeader._make( - self._trace_header_format.unpack(data)) + trace_header = read_trace_header(self._fh, pos) return trace_header @property diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/test/__init__.py @@ -0,0 +1 @@ + diff --git a/test/test_float.py b/test/test_float.py new file mode 100644 index 0000000..b553c5d --- /dev/null +++ b/test/test_float.py @@ -0,0 +1,50 @@ +import unittest +from ibm_float import ieee2ibm, ibm2ieee + + +class Ibm2Ieee(unittest.TestCase): + + def test_zero(self): + self.assertEqual(ibm2ieee(b'\0\0\0\0'), 0.0) + + def test_positive_half(self): + self.assertEqual(ibm2ieee((0b11000000, 0x80, 0x00, 0x00)), -0.5) + + def test_negative_half(self): + self.assertEqual(ibm2ieee((0b01000000, 0x80, 0x00, 0x00)), 0.5) + + def test_one(self): + self.assertEqual(ibm2ieee(b'\x41\x10\x00\x00'), 1.0) + + def test_negative_118_625(self): + # Example taken from Wikipedia http://en.wikipedia.org/wiki/IBM_Floating_Point_Architecture + self.assertEqual(ibm2ieee((0b11000010, 0b01110110, 0b10100000, 0b00000000)), -118.625) + + + +class Ieee2Ibm(unittest.TestCase): + + def test_zero(self): + self.assertEqual(ieee2ibm(0.0), b'\0\0\0\0') + + def test_positive_half(self): + self.assertEqual(ieee2ibm(-0.5), bytes((0b11000000, 0x80, 0x00, 0x00))) + + def test_negative_half(self): + self.assertEqual(ieee2ibm(0.5), bytes((0b01000000, 0x80, 0x00, 0x00))) + + def test_one(self): + self.assertEqual(ieee2ibm(1.0), b'\x41\x10\x00\x00') + + def test_negative_118_625(self): + # Example taken from Wikipedia http://en.wikipedia.org/wiki/IBM_Floating_Point_Architecture + self.assertEqual(ieee2ibm(-118.625), bytes((0b11000010, 0b01110110, 0b10100000, 0b00000000))) + + def test_0_1(self): + # Note, this is different from the Wikipedia example, but I'll stick my neck out and say + # Wikipedia is wrong. + self.assertEqual(ieee2ibm(0.1), bytes((0b01000000, 0b00011001, 0b10011001, 0b10011001))) + + +if __name__ == '__main__': + unittest.main() diff --git a/toolkit.py b/toolkit.py index 8ecba83..d77729c 100644 --- a/toolkit.py +++ b/toolkit.py @@ -7,10 +7,11 @@ import struct from catalog import CatalogBuilder from datatypes import CTYPES, size_in_bytes from reel_header_definition import HEADER_DEF -from ibm_float import ibm2ieee +from ibm_float import ibm2ieee, ieee2ibm from revisions import canonicalize_revision from trace_header_definition import TRACE_HEADER_DEF from util import file_length +from portability import EMPTY_BYTE_STRING REEL_HEADER_NUM_BYTES = 3600 @@ -208,16 +209,36 @@ def catalog_traces(fh, bps, endian='>', progress=None): cdp_catalog, line_catalog) +def read_trace_header(fh, trace_header_format, pos=None): + """Read a trace header. + + Args: + fh: A file-like-object open in binary mode. + + trace_header_format: A Struct object, such as obtained from a + call to compile_trace_header_format() + + pos: The file offset in bytes from the beginning from which the data + is to be read. + + Returns: + A TraceHeader object. + """ + if pos is not None: + fh.seek(pos) + data = fh.read(TRACE_HEADER_NUM_BYTES) + trace_header = TraceHeader._make( + trace_header_format.unpack(data)) + return trace_header -def read_binary_values(fh, pos, ctype='l', count=1, endian='>'): +def read_binary_values(fh, pos=None, ctype='l', count=1, endian='>'): """Read a series of values from a binary file. Args: fh: A file-like-object open in binary mode. - pos: The file offset in bytes from the beginning from which the data - is to be read. +c ctype: The SEG Y data type. @@ -270,6 +291,9 @@ def unpack_values(buf, count, item_size, fmt, endian='>'): fmt: A format code (one of the values in the datatype.CTYPES dictionary) + endian: '>' for big-endian data (the standard and default), '<' + for little-endian (non-standard) + Returns: A sequence of objects with type corresponding to the format code. """ @@ -283,6 +307,84 @@ def unpack_values(buf, count, item_size, fmt, endian='>'): # swapping ourselves. +def write_trace_header(fh, trace_header, trace_header_format, pos=None): + """Write a TraceHeader to file. + + Args: + fh: A file-like object open in binary mode for writing. + + trace_header: A TraceHeader object. + + trace_header_format: A Struct object, such as obtained from a + call to compile_trace_header_format() + + pos: An optional file offset in bytes from the beginning of the + file. Defaults to the current file position. + """ + if pos is not None: + fh.seek(pos, os.SEEK_SET) + buf = trace_header_format.pack(trace_header) + fh.write(buf) + + +def write_trace_values(fh, values, ctype='l', pos=None): + write_binary_values(fh, values, ctype, pos) + + +def write_binary_values(fh, values, ctype='l', pos=None): + """Write a series on values to a file. + + Args: + fh: A file-like-object open for writing in binary mode. + + values: An iterable series of values. + + ctype: The SEG Y data type. + + pos: An optional offset from the beginning of the file. If omitted, + any writing is done at the current file position. + """ + fmt = CTYPES[ctype] + + if pos is not None: + fh.seek(pos, os.SEEK_SET) + + buf = (pack_ibm_floats(values) + if fmt == 'ibm' + else pack_values(values, fmt)) + + fh.write(buf) + + +def pack_ibm_floats(values): + """Pack floats into binary-encoded big-endian single-precision IBM floats. + + Args: + values: An iterable series of numeric values. + + Returns: + A sequence of bytes. (Python 2 - a str object, Python 3 - a bytes + object) + """ + return EMPTY_BYTE_STRING.join(ieee2ibm(value) for value in values) + + +def pack_values(values, fmt, endian='>'): + """Pack values into binary encoded big-endian byte strings. + + Args: + values: An iterable series of values. + + fmt: A format code (one of the values in the datatype.CTYPES + dictionary) + + endian: '>' for big-endian data (the standard and default), '<' + for little-endian (non-standard) + """ + c_format = '{}{}{}'.format(endian, len(values), fmt) + return struct.pack(c_format, *values) + + _TraceAttributeSpec = namedtuple('Record', ['name', 'pos', 'type']) diff --git a/util.py b/util.py index 956e30b..ac74425 100644 --- a/util.py +++ b/util.py @@ -2,6 +2,8 @@ import itertools import time import os +from portability import izip + def pairwise(iterable): """Pairwise iteration. @@ -14,7 +16,7 @@ def pairwise(iterable): """ a, b = itertools.tee(iterable) next(b, None) - return itertools.izip(a, b) + return izip(a, b) def contains_duplicates(sorted_iterable):