3
Udo8                 @   s  d Z ddlmZ ddlmZ ddlZddlZddlZddlm	Z	 ddl
ZddlmZ ddlmZ dZd	Zd
ZdZddddddddddddddddgZdZdZdZdZd e d!e d!e d!e d"	Zd#e d!e d$Zd%Zeed&d'd(Zed)d*d+Zd,d- Zd.d/ Z G d0d1 d1eej!Z"dS )2z
Read a SAS XPort format file into a Pandas DataFrame.

Based on code from Jack Cushman (github.com/jcushman/xport).

The file format is defined here:

https://support.sas.com/techsup/technote/ts140.pdf
    )abc)datetimeN)Appender)get_filepath_or_buffer)
ReaderBasezPHEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000  zKHEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000zPHEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000  zPHEADER RECORD*******OBS     HEADER RECORD!!!!!!!000000000000000000000000000000  ntypeZnhfunfield_lengthZnvar0namelabelZnformZnflZnum_decimalsZnfjZnfillZniformZniflZnifdZnpos_zParameters
----------
filepath_or_buffer : string or file-like object
    Path to SAS file or object implementing binary read method.zindex : identifier of index column
    Identifier of column that should be used as index of the DataFrame.
encoding : string
    Encoding for text data.
chunksize : int
    Read file `chunksize` lines at a time, returns iterator.zEformat : string
    File format, only `xport` is currently supported.z_iterator : boolean, default False
    Return XportReader object for reading file incrementally.z#Read a SAS file into a DataFrame.


a  

Returns
-------
DataFrame or XportReader

Examples
--------
Read a SAS Xport file:

>>> df = pd.read_sas('filename.XPT')

Read a Xport file in 10,000 line chunks:

>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
>>>     do_something(chunk)

z$Class for reading SAS Xport files.

z

Attributes
----------
member_info : list
    Contains information about the file
fields : list
    Contains information about the variables in the file
zRead observations from SAS Xport file, returning as data frame.

Parameters
----------
nrows : int
    Number of rows to read from data file; if None, read whole
    file.

Returns
-------
A DataFrame.
)datestrreturnc             C   s(   yt j| dS  tk
r"   tjS X dS )z3 Given a date in xport format, return Python date. z%d%b%y:%H:%M:%SN)r   strptime
ValueErrorpdZNaT)r    r   K/var/www/html/virt/lib64/python3.6/site-packages/pandas/io/sas/sas_xport.py_parse_date   s    r   )sc             C   sD   i }d}x0|D ](\}}| |||  j  ||< ||7 }qW |d= |S )a  
    Parameters
    ----------
    s: str
        Fixed-length string to split
    parts: list of (name, length) pairs
        Used to break up string, name '_' will be filtered from output.

    Returns
    -------
    Dict of name:contents of string at given location.
    r   r   )strip)r   partsoutstartr	   lengthr   r   r   _split_line   s    r   c             C   sT   |dkrPt jt| t jd}t jd| dd|  }|j|d}| |d< |S | S )N   ZS8Sz,S)dtypef0)npzeroslenr   view)vecnbytesvec1r   Zvec2r   r   r   _handle_truncated_float_vec   s    	r'   c       	      C   s  t jd}| j|d}|d }|d }|d@ }t jt| t jd}d|t j|d@ < d|t j|d	@ < d
|t j|d@ < ||L }||? |d@ dd
|  > B }|dM }||d? d@ d d> | d d> |d@ B O }t jt|fdd}||d< ||d< |jdd}|jd}|S )zf
    Parse a vector of float values representing IBM 8 byte floats into
    native 8 byte floats.
    z>u4,>u4)r   r   f1i    i       i  @    i         l          A   i     l        z>f8Zf8)	r    r   r#   r!   r"   Zuint8whereemptyZastype)	r$   r   r&   Zxport1Zxport2Zieee1shiftZieee2Zieeer   r   r   _parse_float_vec   s(    
		 
r5   c               @   sl   e Zd ZeZdddZdd Zdd Zd	d
 Zdd Z	e
dddZdddZdd ZeedddZdS )XportReaderN
ISO-8859-1c             C   sd   || _ d| _|| _|| _t|tr6t||d\}}}}t|ttfrRt|d| _	n|| _	| j
  d S )Nr   )encodingrb)	_encoding_lines_read_index
_chunksize
isinstancestrr   bytesopenfilepath_or_buffer_read_header)selfrB   indexr8   	chunksizecompressionZshould_closer   r   r   __init__   s    
zXportReader.__init__c             C   s   | j j  d S )N)rB   close)rD   r   r   r   rI     s    zXportReader.closec             C   s   | j jdj S )NP   )rB   readdecode)rD   r   r   r   _get_row  s    zXportReader._get_rowc             C   sd  | j jd | j }|tkr,| j  td| j }ddgddgddgddgd	d
gg}t||}|d dkr|| j  tdt|d	 |d	< || _| j }t|d d
 |d< | j }| j }|j	t
}|tk}	|o|	s| j  tdt|d&d' }
ddgddgddgddgddgddgd	d
gg}t| j |}dd
gdd
gddgddgg}|jt| j | t|d |d< t|d	 |d	< || _ddd}t| j dd }|
| }|d r|d|d  7 }| j j|}g }d}xt||
kr|d |
 ||
d   }}|jd}tjd|}ttt|}|d= ||d  |d< |d }|d dkr|dk sn|dkr| j  d | d!}t|x>|j D ]2\}}y|j ||< W n tk
r   Y nX qW ||d 7 }||g7 }qW | j }|tks| j  td"|| _|| _| j j | _| j  | _!d#d$ | jD | _"d%d$ t#| jD }t$j%|}|| _&d S )(Nr   z#Header record is not an XPORT file.prefixr.   versionr   ZOSr   created   zSAS     SAS     SASLIBz!Header record has invalid prefix.modifiedzMember header not found   r*   set_nameZsasdatar
   (   typenumericchar)r)   r*   6   :   rJ      z>hhhh8s40s8shhh2s8shhl52sr   r   zFloating field width z is not between 2 and 8.zObservation header not found.c             S   s   g | ]}|d  j  qS )r	   )rL   ).0xr   r   r   
<listcomp>y  s    z,XportReader._read_header.<locals>.<listcomp>c             S   s,   g | ]$\}}d t | dt |d  fqS )r   r   r   )r?   )r\   ifieldr   r   r   r^   }  s   )'rB   seekrM   _correct_line1rI   r   r   r   	file_info
startswith_correct_header1_correct_header2intupdatemember_inforK   r"   ljuststructunpackdictzip
_fieldkeys	TypeErroritemsr   AttributeError_correct_obs_headerfieldsrecord_lengthtellrecord_start_record_countnobscolumns	enumerater    r   _dtype)rD   Zline1Zline2Zfifre   Zline3Zheader1Zheader2Z	headflag1Z	headflag2ZfieldnamelengthZmemrk   typesZ
fieldcountZ
datalengthZ	fielddatarv   Z
obs_lengthr`   ZfieldstructflmsgkvheaderZdtypelr   r   r   r   rC     s    "






"


zXportReader._read_headerc             C   s   | j | jpddS )Nr)   )nrows)rK   r=   )rD   r   r   r   __next__  s    zXportReader.__next__)r   c             C   s   | j jdd | j j | j }|d dkr4tjd | jdkrV| j j| j || j S | j jdd | j jd}tj	|tj
d}tj|dk}t|dkrd}ndt| }| j j| j || | j S )	z
        Get number of records in file.

        This is maybe suboptimal because we have to seek to the end of
        the file.

        Side effect: returns file position to record_start.
        r   r*   rJ   zxport file may be corrupted)r   l     @@  r   i)rB   rc   rx   ry   warningswarnrw   rK   r    
frombufferZuint64Zflatnonzeror"   )rD   Ztotal_records_lengthZ	last_cardixZtail_padr   r   r   rz     s     	


zXportReader._record_countc             C   s   |dkr| j }| j|dS )a  
        Reads lines from Xport file and returns as dataframe

        Parameters
        ----------
        size : int, defaults to None
            Number of lines to read.  If None, reads whole file.

        Returns
        -------
        DataFrame
        N)r   )r=   rK   )rD   sizer   r   r   	get_chunk  s    zXportReader.get_chunkc             C   sl   |j dd}|d dk|d dk@ |d dk@ }|d dk|d d	k@ |d d
kB |d dkB }||M }|S )Nzu1,u1,u2,u4)r   r(   r   f2Zf3r   r0   Z   _   .   )r#   )rD   r$   r   missZmiss1r   r   r   _missing_double  s
    $0zXportReader._missing_doublec                sd  |d kr j }t| j  j }| j }|dkr> j  t jj|}tj	| j
|d}tjt|d}xt jD ]\}}|dt|  }	 j| d }
|
dkrt|	 j| d }	 j|	}t|	}tj||< n@ j| d dkrd	d
 |	D } jd k	r fdd
|D }|||< qxW  jd krFt j j| |_n|j j}  j|7  _|S )Nr   )r   count)rE   r   r   rW   r   rX   c             S   s   g | ]}|j  qS r   )rstrip)r\   yr   r   r   r^     s    z$XportReader.read.<locals>.<listcomp>c                s   g | ]}|j  jqS r   )rL   r:   )r\   r   )rD   r   r   r^     s    )r{   minr;   rw   rI   StopIterationrB   rK   r    r   r~   r   Z	DataFrameranger}   r|   r?   rv   r'   r   r5   nanr:   r<   rE   Z	set_index)rD   r   
read_linesZread_lenrawdatadfjr]   r$   r   r   r   r   )rD   r   rK     s8    

zXportReader.read)Nr7   N)N)N)__name__
__module____qualname___xport_reader_doc__doc__rH   rI   rM   rC   r   ri   rz   r   r   r   _read_method_docrK   r   r   r   r   r6      s   
m#
r6   )#r   collectionsr   r   rm   r   Znumpyr    Zpandas.util._decoratorsr   Zpandasr   Zpandas.io.commonr   Zpandas.io.sas.sasreaderr   rd   rg   rh   ru   rq   Z_base_params_docZ_params2_docZ_format_params_docZ_iterator_docZ_read_sas_docr   r   r?   r   r   r'   r5   Iteratorr6   r   r   r   r   <module>	   sR   	9