3
qd{o                 @   s   d Z ddlmZ ddlmZmZ ddlZddlZddlm	Z	m
Z
 ddlZddlmZ ddlmZ ddljjjZddlmZ ejeejd	d
dZG dd dZG dd dZG dd deejZdS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )abc)datetime	timedeltaN)EmptyDataErrorOutOfBoundsDatetime)get_filepath_or_buffer)Parser)
ReaderBase)sas_datetimesunitreturnc             C   s^   yt j| |ddS  tk
rX   |dkr6| jdd S |dkrL| jdd S tdY nX d	S )
a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originsc             S   s   t dddt| d S )Ni     )seconds)r   r   )	sas_float r   8/tmp/pip-build-7vycvbft/pandas/pandas/io/sas/sas7bdat.py<lambda>7   s    z$_convert_datetimes.<locals>.<lambda>dc             S   s   t dddt| d S )Ni  r   )days)r   r   )r   r   r   r   r   ;   s    zunit must be 'd' or 's'N)pdZto_datetimer   apply
ValueError)r
   r   r   r   r   _convert_datetimes    s    

r   c               @   s   e Zd ZdS )_subheader_pointerN)__name__
__module____qualname__r   r   r   r   r   A   s   r   c               @   s   e Zd ZdS )_columnN)r   r   r   r   r   r   r   r   E   s   r   c               @   s   e Zd ZdZd>ddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd?d8d9Zd:d; Zd<d= Z dS )@SAS7BDATReadera!  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    NTc	       
      C   s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|\| _}	}	}	t| jtrt| jd| _| j| _| j  | j  d S )Nzlatin-1 r   rb)indexconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textdefault_encodingcompressioncolumn_names_stringscolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointers_cached_page_column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   _path_or_buf
isinstancestropenhandle_get_properties_parse_metadata)
selfZpath_or_bufr#   r$   r%   r&   r'   r(   r)   _r   r   r   __init__g   s6    zSAS7BDATReader.__init__c             C   s   t j| jt jdS )z5Return a numpy int64 array of the column data lengths)dtype)npasarrayr2   int64)r=   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc             C   s   t j| jt jdS )z0Return a numpy int64 array of the column offsets)r@   )rA   rB   r3   rC   )r=   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc             C   s   t j| jt jddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1)r@   )rA   rB   r4   r@   )r=   r   r   r   column_types   s    zSAS7BDATReader.column_typesc             C   s(   y| j j  W n tk
r"   Y nX d S )N)r:   closeAttributeError)r=   r   r   r   rG      s    zSAS7BDATReader.closec             C   s
  | j jd | j jd| _| jdttj tjkrD| j  tdd\}}| j	tj
tj}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| j	tjtj}|tjkrtj}|| }| j	tjtj}|dkrd	| _nd
| _| j	tjtjd }|tjkr"tj| | _nd| d| _| j	tj tj!}|dkrRd| _"n|dkrdd| _"nd| _"| j	tj#tj$}|j%d| _&| j'r| j&j(| j)p| j*| _&| j	tj+tj,}|j%d| _-| j'r| j-j(| j)p| j*| _-t.ddd}| j/tj0| tj1}|t2j3|dd | _4| j/tj5| tj6}|t2j3|dd | _7| j8tj9| tj:| _;| j j| j;d }|  j|7  _t| j| j;kr| j  td| j8tj<| tj=| _>| j8tj?| tj@| _A| j	tjB| tjC}|j%d| _D| j'r| jDj(| j)p| j*| _D| j	tjE| tjF}|j%d| _G| j'rB| jGj(| j)p<| j*| _G| j	tjH| tjI}|j%d| _J| j'r| jJj(| j)p|| j*| _J| j	tjK| tjL}|j%d}t|dkr|j(| j)p| j*| _Mn@| j	tjN| tjO}|j%d| _M| j'r| jMj(| j)p | j*| _Md S )Nr   i   z'magic number mismatch (not a SAS file?)T   F      <>zunknown (code=)   1unix   2Zwindowsunknowns     i  r   r   )r   z*The SAS7BDAT file appears to be truncated.)r   r   )Pr6   seekreadr1   lenconstmagicrG   r   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64_int_lengthZpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatformZdataset_offsetZdataset_lengthrstripnamer)   decoder'   r*   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r=   Zalign1Zalign2bufZtotal_alignepochxr   r   r   r;      s    





zSAS7BDATReader._get_propertiesc             C   s"   | j | jpdd}|d krt|S )Nr   )nrows)rT   r&   StopIteration)r=   dar   r   r   __next__.  s    zSAS7BDATReader.__next__c             C   sJ   |dkr| j   td| j||}|dkr0dnd}tj| j| |d S )NrJ   rI   zinvalid float widthfr   r   )rJ   rI   )rG   r   rX   structunpackr]   )r=   offsetwidthrj   fdr   r   r   rd   5  s    zSAS7BDATReader._read_floatc             C   sP   |dkr| j   td| j||}dddd	d
| }tj| j| |d }|S )Nr      rJ   rI   zinvalid int widthbhlq)r   rw   rJ   rI   r   )r   rw   rJ   rI   )rG   r   rX   rr   rs   r]   )r=   rt   ru   rj   itZivr   r   r   re   >  s    zSAS7BDATReader._read_intc             C   s   | j d krX| jj| | jj|}t||k rT| j  d|dd|dd}t||S || t| j krz| j  td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r1   r6   rS   rT   rU   rG   r   )r=   rt   lengthrj   msgr   r   r   rX   G  s    
zSAS7BDATReader._read_bytesc             C   sZ   d}xP|sT| j j| j| _t| jdkr*P t| j| jkrJ| j  td| j }qW d S )NFr   z2Failed to read a meta data page from the SAS file.)r6   rT   rg   r1   rU   rG   r   _process_page_meta)r=   doner   r   r   r<   V  s    zSAS7BDATReader._parse_metadatac             C   sV   | j   tjtjgtj }| j|kr,| j  | jtj@ }| jtjk}|pT|pT| jg kS )N)	_read_page_headerrV   page_meta_typeZpage_amd_typepage_mix_types_current_page_type_process_page_metadatapage_data_typer0   )r=   ptis_data_pageZis_mix_pager   r   r   r   a  s    
z!SAS7BDATReader._process_page_metac             C   sX   | j }tj| }| j|tj| _tj| }| j|tj| _tj	| }| j|tj
| _d S )N)r[   rV   Zpage_type_offsetre   Zpage_type_lengthr   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r=   
bit_offsetZtxr   r   r   r   n  s    


z SAS7BDATReader._read_page_headerc             C   st   | j }xht| jD ]Z}| jtj| |}|jdkr4q|jtjkrBq| j	|j
}| j||j|j}| j|| qW d S )Nr   )r[   ranger   _process_subheader_pointersrV   Zsubheader_pointers_offsetr~   r+   Ztruncated_subheader_id_read_subheader_signaturert   _get_subheader_indexptype_process_subheader)r=   r   ipointersubheader_signaturesubheader_indexr   r   r   r   y  s    
z%SAS7BDATReader._process_page_metadatac             C   s`   t jj|}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n| j  t	d|S )Nr   r!   zUnknown subheader signature)
rV   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer+   SASIndexdata_subheader_indexrG   r   )r=   	signaturer+   r   r#   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexc       
      C   s   | j }|||  }| j|| j}|| j7 }| j|| j}|| j7 }| j|d}|d7 }| j|d}t }	||	_||	_||	_||	_|	S )Nr   )r\   re   rZ   r   rt   r~   r+   r   )
r=   rt   Zsubheader_pointer_indexZsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typerl   r   r   r   r     s    

z*SAS7BDATReader._process_subheader_pointersc             C   s   | j || j}|S )N)rX   rZ   )r=   rt   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signaturec             C   s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| jj| d S td||| d S )Nzunknown subheader index)rt   r~   rV   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   r0   appendr   )r=   r   r   rt   r~   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheaderc             C   s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| j|tj|  || _| j|tj|  || _| j|tj|  || _	| j|tj
|  || _tj| }| j|| || _| j|d| _| j|d| _d S )Ni  i  ib  iz  rw   )rZ   rY   re   rV   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r=   rt   r~   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s(    

z)SAS7BDATReader._process_rowsize_subheaderc             C   sT   | j }||7 }| j||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)rZ   re   column_countr   r   print)r=   rt   r~   r   r   r   r   r     s    z,SAS7BDATReader._process_columnsize_subheaderc             C   s   d S )Nr   )r=   rt   r~   r   r   r   r     s    z(SAS7BDATReader._process_subheader_countsc       
      C   s  || j 7 }| j|tj}| j||}|d| jd}|}| jrR|j| jpN| j	}| j
j| t| j
dkrd}xtjD ]}||krz|}qzW || _|| j 8 }|d }	| jr|	d7 }	| j|	| j}|jd}|dkrd| _|d }	| jr|	d7 }	| j|	| j}|d| j | _n|tjkrV|d	 }	| jr6|	d7 }	| j|	| j}|d| j | _nH| jdkrd| _|d }	| jr|	d7 }	| j|	| j}|d| j | _| jrt| d
r| jj| jp| j	| _d S )Nr   s     r   r!      rJ           (   creator_proc)rZ   re   rV   Ztext_block_size_lengthrX   r`   r)   rb   r'   r*   r,   r   rU   Zcompression_literalsr+   rY   r   r   r   Zrle_compressionhasattr)
r=   rt   r~   Ztext_block_sizerj   Z	cname_rawcnameZcompression_literalZclZoffset1r   r   r   r     sX    



z,SAS7BDATReader._process_columntext_subheaderc             C   s   | j }||7 }|d|  d d }xt|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| j|tj}	| j|tj	}
| j|tj
}| j|	 }| jj||
|
|   q,W d S )Nrw      rI   r   )rZ   r   rV   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetre   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthr,   r-   r   )r=   rt   r~   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_strr   r   r   r   -  s    


z,SAS7BDATReader._process_columnname_subheaderc       
      C   s   | j }|d|  d |d  }xt|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| j||}	| jj|	 | j|tj	}	| j
j|	 | j|tj}	| jj|	dkrdnd q(W d S )Nrw   r   rI   r      d   s)rZ   r   rV   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetre   r3   r   Zcolumn_data_length_lengthr2   Zcolumn_type_lengthr4   )
r=   rt   r~   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesrl   r   r   r   r   M  s    z2SAS7BDATReader._process_columnattributes_subheaderc             C   s   d S )Nr   )r=   rt   r~   r   r   r   r   g  s    z,SAS7BDATReader._process_columnlist_subheaderc             C   s  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| j|tj	}
t
|
t| jd }| j|tj}| j|tj}| j|tj}t
|t| jd }| j|tj}| j|	tj}| j| }||||  }| j| }||||  }t| j}t }||_| j| |_||_||_| j| |_| j| |_| jj| | jj| d S )N   r   )rZ   rV   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetre   Z)column_format_text_subheader_index_lengthminrU   r,   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr/   r   Zcol_idr-   ra   labelformatr4   ctyper2   r~   r.   r   )r=   rt   r~   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenrl   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r   k  s@    





z(SAS7BDATReader._process_format_subheaderc             C   s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF| j  td| j| jkrVd S | j| j }||krn|}| jjd}| jjd}tj	||ft
d| _tj|d| ftjd| _d| _t| }|j| | j }| jd k	r|j| j}|S )Nr   zNo columns to parse from filer   r   )r@   rI   )r&   r   rU   r4   rG   r   r5   countrA   emptyobject_string_chunkzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rT   _chunk_to_dataframer#   Z	set_index)r=   rm   mZndnsprsltr   r   r   rT     s.    

zSAS7BDATReader.readc             C   s   g | _ | jj| j| _t| jdkr(dS t| j| jkrf| j  dt| jdd| jdd}t|| j  | j	}|t
jkr| j  |t
j@ }t
jgt
j }| r| j	|kr| j S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)r0   r6   rT   rg   r1   rU   rG   r   r   r   rV   r   r   r   r   _read_next_page)r=   r   Z	page_typer   r   r   r   r   r     s"    

zSAS7BDATReader._read_next_pagec       
      C   s  | j }| j}t|| |}tj|d}d
\}}xft| jD ]V}| j| }| j| dkr| j|d d f j	| j
d d||< tj|| tjd||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q<| j| dkrx| j|d d f ||< | jrD| jd k	rD|| jj| jp<| j||< | jrn|| jj dk}	tj|j|	|f< |d7 }q<| j  td	| j|  q<W |S )N)r#   r   r   r   )r@   r   r   r   zunknown column type )r   r   ) r   r5   r   r   Z	DataFramer   r-   r4   r   viewr]   rA   rB   Zfloat64r$   r.   rV   Zsas_date_formatsr   Zsas_datetime_formatsr   r(   r'   r8   rb   r*   r%   rU   nanlocrG   r   )
r=   nr   ixr   ZjsZjbjra   iir   r   r   r     s8    
$


z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)!r   r   r   __doc__r?   rD   rE   rF   rG   r;   rp   rd   re   rX   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   r   r   r   r   r   r   r    J   sJ         
$ 		4 0
"r    )r   collectionsr   r   r   rr   numpyrA   Zpandas.errorsr   r   Zpandasr   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsrV   Zpandas.io.sas.sasreaderr	   ZSeriesr8   r   r   r   Iteratorr    r   r   r   r   <module>   s   !