diff --git a/toolkit.py b/toolkit.py index d77729c..cb92e8e 100644 --- a/toolkit.py +++ b/toolkit.py @@ -6,39 +6,55 @@ import struct from catalog import CatalogBuilder from datatypes import CTYPES, size_in_bytes -from reel_header_definition import HEADER_DEF +from encoding import guess_encoding +from binary_reel_header_definition import HEADER_DEF from ibm_float import ibm2ieee, ieee2ibm from revisions import canonicalize_revision from trace_header_definition import TRACE_HEADER_DEF -from util import file_length +from util import file_length, batched from portability import EMPTY_BYTE_STRING - -REEL_HEADER_NUM_BYTES = 3600 +TEXTUAL_HEADER_NUM_BYTES = 3200 +BINARY_HEADER_NUM_BYTES = 400 +REEL_HEADER_NUM_BYTES = TEXTUAL_HEADER_NUM_BYTES + BINARY_HEADER_NUM_BYTES TRACE_HEADER_NUM_BYTES = 240 -def extract_revision(reel_header): +def extract_revision(binary_reel_header): """Obtain the SEG Y revision from the reel header. Args: - reel_header: A dictionary containing a reel header, such as obtained - from read_reel_header() + binary_reel_header: A dictionary containing a reel header, such as obtained + from read_binary_reel_header() Returns: One of the constants revisions.SEGY_REVISION_0 or revisions.SEGY_REVISION_1 """ - raw_revision = reel_header['SegyFormatRevisionNumber'] + raw_revision = binary_reel_header['SegyFormatRevisionNumber'] return canonicalize_revision(raw_revision) -def bytes_per_sample(reel_header, revision): +def num_extended_textual_headers(binary_reel_header): + """Obtain the number of 3200 byte extended textual file headers. + + A value of zero indicates there are no Extended Textual File Header records + (i.e. this file has no Extended Textual File Header(s)). A value of -1 indicates + that there are a variable number of Extended Textual File Header records and the + end of the Extended Textual File Header is denoted by an ((SEG: EndText)) stanza + in the final record. A positive value indicates that there are exactly that many + Extended Textual File Header records. + """ + num_ext_headers = binary_reel_header['NumberOfExtTextualHeaders'] + return num_ext_headers + + +def bytes_per_sample(binary_reel_header, revision): """Determine the number of bytes per sample from the reel header. Args: - reel_header: A dictionary containing a reel header, such as obtained - from read_reel_header() + binary_reel_header: A dictionary containing a reel header, such as obtained + from read_binary_reel_header() revision: One of the constants revisions.SEGY_REVISION_0 or revisions.SEGY_REVISION_1 @@ -46,12 +62,12 @@ def bytes_per_sample(reel_header, revision): Returns: An integer number of bytes per sample. """ - dsf = reel_header['DataSampleFormat'] + dsf = binary_reel_header['DataSampleFormat'] bps = HEADER_DEF["DataSampleFormat"]["bps"][revision][dsf] return bps -def samples_per_trace(reel_header): +def samples_per_trace(binary_reel_header): """Determine the number of samples per trace from the reel header. Note: There is no requirement for all traces to be of the same length, @@ -61,16 +77,16 @@ def samples_per_trace(reel_header): trace headers. Args: - reel_header: A dictionary containing a reel header, such as obtained - from read_reel_header() + binary_reel_header: A dictionary containing a reel header, such as obtained + from read_binary_reel_header() Returns: An integer number of samples per trace """ - return reel_header['ns'] + return binary_reel_header['ns'] -def trace_length_bytes(reel_header, bps): +def trace_length_bytes(binary_reel_header, bps): """Determine the trace length in bytes from the reel header. Note: There is no requirement for all traces to be of the same length, @@ -80,22 +96,51 @@ def trace_length_bytes(reel_header, bps): trace headers. Args: - reel_header: A dictionary containing a reel header, such as obtained - from read_reel_header() + binary_reel_header: A dictionary containing a reel header, such as obtained + from read_binary_reel_header() bps: The number of bytes per sample, such as obtained from a call to bytes_per_sample() """ - return samples_per_trace(reel_header) * bps + TRACE_HEADER_NUM_BYTES + return samples_per_trace(binary_reel_header) * bps + TRACE_HEADER_NUM_BYTES -def read_reel_header(fh, endian='>'): - """Read the SEG Y reel header, also known as the binary header. +def read_textual_reel_header(fh, encoding=None): + """Read the SEG Y card image header, also known as the textual header Args: - fh: A file-like-object open in binary mode positioned such that the - beginning of the reel header will be the next byte to be read. + fh: A file-like object open in binary mode positioned such that the + beginning of the textual header will be the next byte to read. + + encoding: Optional encoding of the header in the file. If None (the + default) a reliable heuristic will be used to guess the encoding. + Typically 'cp037' for EBCDIC or 'ascii' for ASCII. + + Returns: + A tuple of forty Unicode strings (Python 2: unicode, Python 3: str) + containing the transcoded header data. + """ + raw_header = fh.read(TEXTUAL_HEADER_NUM_BYTES) + + num_bytes_read = len(raw_header) + if num_bytes_read < TEXTUAL_HEADER_NUM_BYTES: + raise EOFError("Only {} bytes of {} byte textual reel header could be read" + .format(num_bytes_read, TEXTUAL_HEADER_NUM_BYTES)) + + if encoding is None: + encoding = guess_encoding(raw_header) + + lines = tuple(bytes(raw_line).decode(encoding) for raw_line in batched(raw_header, 80)) + return lines + + +def read_binary_reel_header(fh, endian='>'): + """Read the SEG Y binary reel header. + + Args: + fh: A file-like object open in binary mode. Binary header is assumed to + be at an offset of 3200 bytes from the beginning of the file. endian: '>' for big-endian data (the standard and default), '<' for little-endian (non-standard) @@ -109,6 +154,114 @@ def read_reel_header(fh, endian='>'): return reel_header + +def has_end_text_stanza(ext_header): + """Determine whether the header is the end text stanza. + + Args: + ext_header: A sequence of forty 80 character Unicode strings. + + Returns: + True if the header is the SEG Y Revision 1 end text header, + otherwise False. + """ + return "((SEG: EndText))" in ext_header[0] + + +def read_extended_headers_until_end(fh, encoding): + """Read an unspecified number of extended textual headers, until the end-text header is found. + + Args: + fh: A file-like object open in binary mode. The first of any extended textual headers + is assumed to be at an offset of 3600 bytes from the beginning of the file + (immediately following the binary reel header). + + encoding: Optional encoding of the header in the file. If None (the + default) a reliable heuristic will be used to guess the encoding. + Typically 'cp037' for EBCDIC or 'ascii' for ASCII. + + Returns: + A list of tuples each containing forty 80-character Unicode strings. + """ + extended_headers = [] + while True: + ext_header = read_textual_reel_header(fh, encoding) + if has_end_text_stanza(ext_header): + break + extended_headers.append(ext_header) + return extended_headers + + +def read_extended_headers_counted(fh, num_expected, encoding): + """Read a specified number of extended textual headers. + + If an end-text stanza is located prematurely (in anything other than the last expected header) + reading will be terminated and a warning logged. + + Args: + fh: A file-like object open in binary mode. The first of any extended textual headers + is assumed to be at an offset of 3600 bytes from the beginning of the file + (immediately following the binary reel header). + + num_expected: A non-negative integer of headers. + + encoding: Optional encoding of the header in the file. If None (the + default) a reliable heuristic will be used to guess the encoding. + Typically 'cp037' for EBCDIC or 'ascii' for ASCII. + + Returns: + A list of tuples each containing forty 80-character Unicode strings. + """ + assert num_expected >= 0 + extended_headers = [] + for i in range(num_expected): + ext_header = read_textual_reel_header(fh, encoding) + if has_end_text_stanza(ext_header): + if i != num_expected - 1: + print("Unexpected end-text extended header") # TODO: Log this + break + extended_headers.append(ext_header) + + return extended_headers + + +def read_extended_textual_headers(fh, binary_reel_header, encoding): + """Read any extended textual reel headers. + + Args: + fh: A file-like object open in binary mode. The first of any extended textual headers + is assumed to be at an offset of 3600 bytes from the beginning of the file + (immediately following the binary reel header). + + binary_reel_header: A dictionary containing data read from the binary + reel header by the read_binary_reel_header() function. + + encoding: Optional encoding of the header in the file. If None (the + default) a reliable heuristic will be used to guess the encoding. + Typically 'cp037' for EBCDIC or 'ascii' for ASCII. + + Returns: + A Unicode string containing the concatenated contents of any extended headers. If there + were no extended headers, the string will be empty. + + Postcondition: + As a post-condition to this function, the file-pointer of fh will be + positioned immediately after the last extended textual header, which + should be the start of the first trace header. + """ + fh.seek(REEL_HEADER_NUM_BYTES) + declared_num_ext_headers = num_extended_textual_headers(binary_reel_header) + extended_headers = [] + if declared_num_ext_headers == -1: + extended_headers.extend(read_extended_headers_until_end(fh, encoding)) + elif declared_num_ext_headers > 0: + extended_headers.extend(read_extended_headers_counted(fh, declared_num_ext_headers, encoding)) + + # Concatenate the extended headers + extended_textual_header = ''.join(line for header in extended_headers for line in header).strip(' ') + return extended_textual_header + + _READ_PROPORTION = 0.75 # The proportion of time spent in catalog_traces # reading the file. Determined empirically. @@ -134,7 +287,8 @@ def catalog_traces(fh, bps, endian='>', progress=None): trace index. Args: - fh: A file-like-object open in binary mode. + fh: A file-like-object open in binary mode, positioned at the + start of the first trace header. bps: The number of bytes per sample, such as obtained by a call to bytes_per_sample() @@ -164,11 +318,12 @@ def catalog_traces(fh, bps, endian='>', progress=None): length = file_length(fh) - pos_begin = REEL_HEADER_NUM_BYTES + pos_begin = fh.tell() trace_offset_catalog_builder = CatalogBuilder() trace_length_catalog_builder = CatalogBuilder() line_catalog_builder = CatalogBuilder() + alt_line_catalog_builder = CatalogBuilder() cdp_catalog_builder = CatalogBuilder() for trace_number in itertools.count(): @@ -186,6 +341,9 @@ def catalog_traces(fh, bps, endian='>', progress=None): line_catalog_builder.add((trace_header.Inline3D, trace_header.Crossline3D), trace_number) + alt_line_catalog_builder.add((trace_header.TraceSequenceFile, + trace_header.cdp), + trace_number) cdp_catalog_builder.add(trace_header.cdp, trace_number) pos_end = pos_begin + TRACE_HEADER_NUM_BYTES + samples_bytes pos_begin = pos_end @@ -202,6 +360,12 @@ def catalog_traces(fh, bps, endian='>', progress=None): progress_callback(_READ_PROPORTION + (_READ_PROPORTION * 3 / 4)) line_catalog = line_catalog_builder.create() + + if line_catalog is None: + # Some 3D files put Inline and Crossline numbers in (TraceSequenceFile, cdp) pair + line_catalog = alt_line_catalog_builder.create() + + progress_callback(1) return (trace_offset_catalog, @@ -209,6 +373,7 @@ def catalog_traces(fh, bps, endian='>', progress=None): cdp_catalog, line_catalog) + def read_trace_header(fh, trace_header_format, pos=None): """Read a trace header. @@ -307,6 +472,21 @@ def unpack_values(buf, count, item_size, fmt, endian='>'): # swapping ourselves. +def write_binary_reel_header(fh, binary_reel_header, endian='>'): + """Write the binary_reel_header to the given file-like object. + + Args: + fh: A file-like object open in binary mode for writing. + + binary_reel_header: A dictionary of + """ + for key in HEADER_DEF: + pos = HEADER_DEF[key]['pos'] + ctype = HEADER_DEF[key]['type'] + value = binary_reel_header[key] if key in binary_reel_header else HEADER_DEF[key]['def'] + write_binary_values(fh, [value], ctype, pos) + + def write_trace_header(fh, trace_header, trace_header_format, pos=None): """Write a TraceHeader to file. @@ -385,6 +565,11 @@ def pack_values(values, fmt, endian='>'): return struct.pack(c_format, *values) +# TODO: Consider generalising the below to also produce a ReelHeader record. Then modify +# read_binary_reel_header() to return such a record, and write_binary_reel_header() to accept such +# a record. + + _TraceAttributeSpec = namedtuple('Record', ['name', 'pos', 'type']) @@ -440,3 +625,5 @@ def _compile_trace_header_record(): TraceHeader = _compile_trace_header_record() + +