3
Ud{o                 @   s   d Z ddlmZ ddlmZmZ ddlZddlZddlm	Z	m
Z
 ddlZddlmZ ddlmZ ddljjjZddlmZ ejeejd	d
dZG dd dZG dd dZG dd deejZdS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )abc)datetime	timedeltaN)EmptyDataErrorOutOfBoundsDatetime)get_filepath_or_buffer)Parser)
ReaderBase)sas_datetimesunitreturnc             C   s^   yt j| |ddS  tk
rX   |dkr6| jdd S |dkrL| jdd S tdY nX d	S )
a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originsc             S   s   t dddt| d S )Ni     )seconds)r   r   )	sas_float r   J/var/www/html/virt/lib64/python3.6/site-packages/pandas/io/sas/sas7bdat.py<lambda>7   s    z$_convert_datetimes.<locals>.<lambda>dc             S   s   t dddt| d S )Ni  r   )days)r   r   )r   r   r   r   r   ;   s    zunit must be 'd' or 's'N)pdto_datetimer   apply
ValueError)r
   r   r   r   r   _convert_datetimes    s    

r   c               @   s   e Zd ZdS )_subheader_pointerN)__name__
__module____qualname__r   r   r   r   r   A   s   r   c               @   s   e Zd ZdS )_columnN)r   r   r   r   r   r   r   r    E   s   r    c               @   s   e Zd ZdZd>ddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd?d8d9Zd:d; Zd<d= Z dS )@SAS7BDATReadera!  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : boolean, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : boolean, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    NTc	       
      C   s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|\| _}	}	}	t| jtrt| jd| _| j| _| j  | j  d S )Nzlatin-1 r   rb)indexconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textdefault_encodingcompressioncolumn_names_stringscolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointers_cached_page_column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   _path_or_buf
isinstancestropenhandle_get_properties_parse_metadata)
selfZpath_or_bufr$   r%   r&   r'   r(   r)   r*   _r   r   r   __init__g   s6    zSAS7BDATReader.__init__c             C   s   t j| jt jdS )z5Return a numpy int64 array of the column data lengths)dtype)npasarrayr3   int64)r>   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc             C   s   t j| jt jdS )z0Return a numpy int64 array of the column offsets)rA   )rB   rC   r4   rD   )r>   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc             C   s   t j| jt jddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1)rA   )rB   rC   r5   rA   )r>   r   r   r   column_types   s    zSAS7BDATReader.column_typesc             C   s(   y| j j  W n tk
r"   Y nX d S )N)r;   closeAttributeError)r>   r   r   r   rH      s    zSAS7BDATReader.closec             C   s
  | j jd | j jd| _| jdttj tjkrD| j  tdd\}}| j	tj
tj}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| j	tjtj}|tjkrtj}|| }| j	tjtj}|dkrd	| _nd
| _| j	tjtjd }|tjkr"tj| | _nd| d| _| j	tj tj!}|dkrRd| _"n|dkrdd| _"nd| _"| j	tj#tj$}|j%d| _&| j'r| j&j(| j)p| j*| _&| j	tj+tj,}|j%d| _-| j'r| j-j(| j)p| j*| _-t.ddd}| j/tj0| tj1}|t2j3|dd | _4| j/tj5| tj6}|t2j3|dd | _7| j8tj9| tj:| _;| j j| j;d }|  j|7  _t| j| j;kr| j  td| j8tj<| tj=| _>| j8tj?| tj@| _A| j	tjB| tjC}|j%d| _D| j'r| jDj(| j)p| j*| _D| j	tjE| tjF}|j%d| _G| j'rB| jGj(| j)p<| j*| _G| j	tjH| tjI}|j%d| _J| j'r| jJj(| j)p|| j*| _J| j	tjK| tjL}|j%d}t|dkr|j(| j)p| j*| _Mn@| j	tjN| tjO}|j%d| _M| j'r| jMj(| j)p | j*| _Md S )Nr   i   z'magic number mismatch (not a SAS file?)T   F      <>zunknown (code=)   1unix   2Zwindowsunknowns     i  r   r   )r   z*The SAS7BDAT file appears to be truncated.)r   r   )Pr7   seekreadr2   lenconstmagicrH   r   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64_int_lengthZpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatformZdataset_offsetZdataset_lengthrstripnamer*   decoder(   r+   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r>   Zalign1Zalign2bufZtotal_alignepochxr   r   r   r<      s    





zSAS7BDATReader._get_propertiesc             C   s"   | j | jpdd}|d krt|S )Nr   )nrows)rU   r'   StopIteration)r>   dar   r   r   __next__.  s    zSAS7BDATReader.__next__c             C   sJ   |dkr| j   td| j||}|dkr0dnd}tj| j| |d S )NrK   rJ   zinvalid float widthfr   r   )rK   rJ   )rH   r   rY   structunpackr^   )r>   offsetwidthrk   fdr   r   r   re   5  s    zSAS7BDATReader._read_floatc             C   sP   |dkr| j   td| j||}dddd	d
| }tj| j| |d }|S )Nr      rK   rJ   zinvalid int widthbhlq)r   rx   rK   rJ   r   )r   rx   rK   rJ   )rH   r   rY   rs   rt   r^   )r>   ru   rv   rk   itZivr   r   r   rf   >  s    zSAS7BDATReader._read_intc             C   s   | j d krX| jj| | jj|}t||k rT| j  d|dd|dd}t||S || t| j krz| j  td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r2   r7   rT   rU   rV   rH   r   )r>   ru   lengthrk   msgr   r   r   rY   G  s    
zSAS7BDATReader._read_bytesc             C   sZ   d}xP|sT| j j| j| _t| jdkr*P t| j| jkrJ| j  td| j }qW d S )NFr   z2Failed to read a meta data page from the SAS file.)r7   rU   rh   r2   rV   rH   r   _process_page_meta)r>   doner   r   r   r=   V  s    zSAS7BDATReader._parse_metadatac             C   sV   | j   tjtjgtj }| j|kr,| j  | jtj@ }| jtjk}|pT|pT| jg kS )N)	_read_page_headerrW   page_meta_typeZpage_amd_typepage_mix_types_current_page_type_process_page_metadatapage_data_typer1   )r>   ptis_data_pageZis_mix_pager   r   r   r   a  s    
z!SAS7BDATReader._process_page_metac             C   sX   | j }tj| }| j|tj| _tj| }| j|tj| _tj	| }| j|tj
| _d S )N)r\   rW   Zpage_type_offsetrf   Zpage_type_lengthr   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r>   
bit_offsetZtxr   r   r   r   n  s    


z SAS7BDATReader._read_page_headerc             C   st   | j }xht| jD ]Z}| jtj| |}|jdkr4q|jtjkrBq| j	|j
}| j||j|j}| j|| qW d S )Nr   )r\   ranger   _process_subheader_pointersrW   Zsubheader_pointers_offsetr   r,   Ztruncated_subheader_id_read_subheader_signatureru   _get_subheader_indexptype_process_subheader)r>   r   ipointersubheader_signaturesubheader_indexr   r   r   r   y  s    
z%SAS7BDATReader._process_page_metadatac             C   s`   t jj|}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n| j  t	d|S )Nr   r"   zUnknown subheader signature)
rW   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer,   SASIndexdata_subheader_indexrH   r   )r>   	signaturer,   r   r$   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexc       
      C   s   | j }|||  }| j|| j}|| j7 }| j|| j}|| j7 }| j|d}|d7 }| j|d}t }	||	_||	_||	_||	_|	S )Nr   )r]   rf   r[   r   ru   r   r,   r   )
r>   ru   Zsubheader_pointer_indexZsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typerm   r   r   r   r     s    

z*SAS7BDATReader._process_subheader_pointersc             C   s   | j || j}|S )N)rY   r[   )r>   ru   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signaturec             C   s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| jj| d S td||| d S )Nzunknown subheader index)ru   r   rW   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   r1   appendr   )r>   r   r   ru   r   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheaderc             C   s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| j|tj|  || _| j|tj|  || _| j|tj|  || _	| j|tj
|  || _tj| }| j|| || _| j|d| _| j|d| _d S )Ni  i  ib  iz  rx   )r[   rZ   rf   rW   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r>   ru   r   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s(    

z)SAS7BDATReader._process_rowsize_subheaderc             C   sT   | j }||7 }| j||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r[   rf   column_countr   r   print)r>   ru   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnsize_subheaderc             C   s   d S )Nr   )r>   ru   r   r   r   r   r     s    z(SAS7BDATReader._process_subheader_countsc       
      C   s  || j 7 }| j|tj}| j||}|d| jd}|}| jrR|j| jpN| j	}| j
j| t| j
dkrd}xtjD ]}||krz|}qzW || _|| j 8 }|d }	| jr|	d7 }	| j|	| j}|jd}|dkrd| _|d }	| jr|	d7 }	| j|	| j}|d| j | _n|tjkrV|d	 }	| jr6|	d7 }	| j|	| j}|d| j | _nH| jdkrd| _|d }	| jr|	d7 }	| j|	| j}|d| j | _| jrt| d
r| jj| jp| j	| _d S )Nr   s     r   r"      rK           (   creator_proc)r[   rf   rW   Ztext_block_size_lengthrY   ra   r*   rc   r(   r+   r-   r   rV   Zcompression_literalsr,   rZ   r   r   r   Zrle_compressionhasattr)
r>   ru   r   Ztext_block_sizerk   Z	cname_rawcnameZcompression_literalZclZoffset1r   r   r   r     sX    



z,SAS7BDATReader._process_columntext_subheaderc             C   s   | j }||7 }|d|  d d }xt|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| j|tj}	| j|tj	}
| j|tj
}| j|	 }| jj||
|
|   q,W d S )Nrx      rJ   r   )r[   r   rW   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetrf   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthr-   r.   r   )r>   ru   r   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_strr   r   r   r   -  s    


z,SAS7BDATReader._process_columnname_subheaderc       
      C   s   | j }|d|  d |d  }xt|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| j||}	| jj|	 | j|tj	}	| j
j|	 | j|tj}	| jj|	dkrdnd q(W d S )Nrx   r   rJ   r      d   s)r[   r   rW   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetrf   r4   r   Zcolumn_data_length_lengthr3   Zcolumn_type_lengthr5   )
r>   ru   r   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesrm   r   r   r   r   M  s    z2SAS7BDATReader._process_columnattributes_subheaderc             C   s   d S )Nr   )r>   ru   r   r   r   r   r   g  s    z,SAS7BDATReader._process_columnlist_subheaderc             C   s  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| j|tj	}
t
|
t| jd }| j|tj}| j|tj}| j|tj}t
|t| jd }| j|tj}| j|	tj}| j| }||||  }| j| }||||  }t| j}t }||_| j| |_||_||_| j| |_| j| |_| jj| | jj| d S )N   r   )r[   rW   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetrf   Z)column_format_text_subheader_index_lengthminrV   r-   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr0   r    Zcol_idr.   rb   labelformatr5   ctyper3   r   r/   r   )r>   ru   r   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenrm   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r   k  s@    





z(SAS7BDATReader._process_format_subheaderc             C   s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF| j  td| j| jkrVd S | j| j }||krn|}| jjd}| jjd}tj	||ft
d| _tj|d| ftjd| _d| _t| }|j| | j }| jd k	r|j| j}|S )Nr   zNo columns to parse from filer   r   )rA   rJ   )r'   r   rV   r5   rH   r   r6   countrB   emptyobject_string_chunkZzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rU   _chunk_to_dataframer$   Z	set_index)r>   rn   mndnsprsltr   r   r   rU     s.    

zSAS7BDATReader.readc             C   s   g | _ | jj| j| _t| jdkr(dS t| j| jkrf| j  dt| jdd| jdd}t|| j  | j	}|t
jkr| j  |t
j@ }t
jgt
j }| r| j	|kr| j S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)r1   r7   rU   rh   r2   rV   rH   r   r   r   rW   r   r   r   r   _read_next_page)r>   r   Z	page_typer   r   r   r   r   r     s"    

zSAS7BDATReader._read_next_pagec       
      C   s  | j }| j}t|| |}tj|d}d
\}}xft| jD ]V}| j| }| j| dkr| j|d d f j	| j
d d||< tj|| tjd||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q<| j| dkrx| j|d d f ||< | jrD| jd k	rD|| jj| jp<| j||< | jrn|| jj dk}	tj|j|	|f< |d7 }q<| j  td	| j|  q<W |S )N)r$   r   r   r   )rA   r   r   r   zunknown column type )r   r   ) r   r6   r   r   Z	DataFramer   r.   r5   r   viewr^   rB   rC   Zfloat64r%   r/   rW   Zsas_date_formatsr   Zsas_datetime_formatsr   r)   r(   r9   rc   r+   r&   rV   nanlocrH   r   )
r>   nr   ixr   ZjsZjbjrb   iir   r   r   r     s8    
$


z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)!r   r   r   __doc__r@   rE   rF   rG   rH   r<   rq   re   rf   rY   r=   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rU   r   r   r   r   r   r   r!   J   sJ         
$ 		4 0
"r!   )r   collectionsr   r   r   rs   ZnumpyrB   Zpandas.errorsr   r   Zpandasr   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsrW   Zpandas.io.sas.sasreaderr	   ZSeriesr9   r   r   r    Iteratorr!   r   r   r   r   <module>   s   !