U
    b/v                     @  s  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZ ddlm  m  mZ ddlm Z  dddddZ!ddddddZ"G dd dZ#G dd dZ$G dd de ej%Z&dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)FilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r   :/tmp/pip-unpacked-wheel-ck39h295/pandas/io/sas/sas7bdat.py_parse_datetime1   s    r   z	pd.Series)sas_datetimesr   returnc                 C  sJ   zt j| |ddW S  tk
rD   | jt|d}tt j|}| Y S X dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r   Zto_datetimer
   applyr   r   Series)r   r   Zs_seriesr   r   r   _convert_datetimes?   s    r%   c                   @  sB   e Zd ZU ded< ded< ded< ded< dddddddZd	S )
_SubheaderPointerintoffsetlengthcompressionptyper(   r)   r*   r+   c                 C  s   || _ || _|| _|| _d S Nr,   )selfr(   r)   r*   r+   r   r   r   __init___   s    z_SubheaderPointer.__init__N__name__
__module____qualname____annotations__r/   r   r   r   r   r&   Y   s
   
r&   c                   @  sV   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
ddZdS )_Columnr'   col_idzstr | bytesnamelabelformatbytesctyper)   r6   r7   r8   r9   r;   r)   c                 C  s(   || _ || _|| _|| _|| _|| _d S r-   r<   )r.   r6   r7   r8   r9   r;   r)   r   r   r   r/   n   s    
z_Column.__init__Nr0   r   r   r   r   r5   f   s   
r5   c                   @  s  e Zd ZU dZded< ded< dVdd	d
dZddddZddddZddddZddddZ	ddddZ
dd ZdddddZdddddd Zddd!d"d#Zddd$d%Zd&dd'd(Zd)d* Zddd+d,Zd-dd.d/d0Zddd1d2d3d4Zdd-d5d6d7Zdd1dd8d9d:Zdddd;d<d=Zdddd;d>d?Zdddd;d@dAZdddd;dBdCZdddd;dDdEZdddd;dFdGZdddd;dHdIZdddd;dJdKZdWdLdMdNdOdPZdQdR Z dSddTdUZ!dS )XSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r'   _int_lengthzbytes | None_cached_pageNTzFilePath | ReadBuffer[bytes])path_or_bufc	           	      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|ddd| _| jj| _z|   |   W n tk
r   |    Y nX d S )Nzlatin-1    r   rbF)Zis_text)indexconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textdefault_encodingr*   column_names_stringscolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersr?   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handleshandle_path_or_buf_get_properties_parse_metadata	Exceptionclose)	r.   r@   rC   rD   rE   rF   rG   rH   rI   r   r   r   r/      s:    
zSAS7BDATReader.__init__z
np.ndarray)r    c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrP   int64r.   r   r   r   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsr[   )r]   r^   rQ   r_   r`   r   r   r   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1r[   )r]   r^   rR   r\   r`   r   r   r   column_types   s    zSAS7BDATReader.column_typesNonec                 C  s   | j   d S r-   )rT   rZ   r`   r   r   r   rZ      s    zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkr<tdd\}}| tj	tj
}|tjkrtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrtj}|| }| tjtj}|d	krd
| _nd| _| tjtjd }|tjkrtj| | _nd| d| _| tjtj }|dkrJd| _!n|dkr\d| _!nd| _!| tj"tj#}|$d| _%| j&r| j%'| j(p| j)| _%| tj*tj+}|$d| _,| j&r| j,'| j(p| j)| _,t-ddd}| .tj/| tj0}|t1j2|dd | _3| .tj4| tj5}|t1j2|dd | _6| 7tj8| tj9| _:| j | j:d }|  j|7  _t| j| j:krtd| 7tj;| tj<| _=| 7tj>| tj?| _@| tjA| tjB}|$d| _C| j&r| jC'| j(p| j)| _C| tjD| tjE}|$d| _F| j&r2| jF'| j(p,| j)| _F| tjG| tjH}|$d| _I| j&rr| jI'| j(pl| j)| _I| tjJ| tjK}|$d}t|dkr|'| j(p| j)| _Ln@| tjM| tjN}|$d| _L| j&r| jL'| j(p| j)| _Ld S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2Zwindowsunknown     r   r   r   r"   z*The SAS7BDAT file appears to be truncated.)OrV   seekreadr?   lenconstmagicr   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueZalign_2_valueU64r>   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZendianness_offsetZendianness_length
byte_orderZencoding_offsetZencoding_lengthZencoding_namesfile_encodingZplatform_offsetZplatform_lengthplatformZdataset_offsetZdataset_lengthrstripr7   rI   decoderG   rJ   Zfile_type_offsetZfile_type_length	file_typer   _read_floatZdate_created_offsetZdate_created_lengthr   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified	_read_intZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_lengthZpage_count_offsetZpage_count_lengthZ_page_countZsas_release_offsetZsas_release_lengthZsas_releaseZsas_server_type_offsetZsas_server_type_lengthZserver_typeZos_version_number_offsetZos_version_number_length
os_versionZos_name_offsetZos_name_lengthos_nameZos_maker_offsetZos_maker_length)r.   Zalign1Zalign2bufZtotal_alignepochxr   r   r   rW      s    




        
 zSAS7BDATReader._get_propertiesc                 C  s*   | j | jpdd}|d kr&|   t|S )Nr   )nrows)rr   rF   rZ   StopIteration)r.   dar   r   r   __next__h  s
    zSAS7BDATReader.__next__)r(   widthc                 C  sJ   |dkr|    td| ||}|dkr0dnd}t| j| |d S )N)rg   rf   zinvalid float widthrg   fr   r   rZ   r   rv   structunpackrz   )r.   r(   r   r   fdr   r   r   r   p  s    zSAS7BDATReader._read_float)r(   r   r    c                 C  sP   |dkr|    td| ||}ddddd| }t| j| |d }|S )N)r      rg   rf   zinvalid int widthbhlqr   r   )r.   r(   r   r   itZivr   r   r   r   y  s    zSAS7BDATReader._read_int)r(   r)   c                 C  s   | j d krX| j| | j|}t||k rT|   d|dd|dd}t||S || t| j krz|   td| j |||  S d S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)r?   rV   rq   rr   rs   rZ   r   )r.   r(   r)   r   msgr   r   r   rv     s    
zSAS7BDATReader._read_bytesc                 C  sN   d}|sJ| j | j| _t| jdkr(qJt| j| jkr@td|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)rV   rr   r   r?   rs   r   _process_page_meta)r.   doner   r   r   rX     s    zSAS7BDATReader._parse_metadataboolc                 C  sZ   |    tjtjgtj }| j|kr,|   | jtj@ }| jtjk}t|pV|pV| j	g kS r-   )
_read_page_headerrt   page_meta_typeZpage_amd_typepage_mix_types_current_page_type_process_page_metadatapage_data_typer   rO   )r.   ptis_data_pageZis_mix_pager   r   r   r     s    
z!SAS7BDATReader._process_page_metac                 C  sX   | j }tj| }| |tj| _tj| }| |tj| _tj	| }| |tj
| _d S r-   )rx   rt   Zpage_type_offsetr   Zpage_type_lengthr   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r.   
bit_offsetZtxr   r   r   r     s    


 z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]Z}| tj| |}|jdkr2q|jtjkr@q| 	|j
}| ||j|j}| || qd S )Nr   )rx   ranger   _process_subheader_pointersrt   Zsubheader_pointers_offsetr)   r*   Ztruncated_subheader_id_read_subheader_signaturer(   _get_subheader_indexr+   _process_subheader)r.   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s"     
  z%SAS7BDATReader._process_page_metadatar:   )	signaturer    c                 C  s`   t j|}|d kr\|t jkp$|dk}|t jk}| jdkrL|rL|rLt jj}n|   t	d|S )Nr   rA   zUnknown subheader signature)
rt   Zsubheader_signature_to_indexgetZcompressed_subheader_idZcompressed_subheader_typer*   SASIndexdata_subheader_indexrZ   r   )r.   r   r*   r+   rC   f1f2r   r   r   r     s    

z#SAS7BDATReader._get_subheader_indexr&   )r(   subheader_pointer_indexr    c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )ry   r   r>   r&   )
r.   r(   r   Zsubheader_pointer_lengthZtotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typer   r   r   r   r     s     

   z*SAS7BDATReader._process_subheader_pointers)r(   r    c                 C  s   |  || j}|S r-   )rv   r>   )r.   r(   r   r   r   r   r     s    z(SAS7BDATReader._read_subheader_signature)r   r   r    c                 C  s   |j }|j}|tjjkr | j}n|tjjkr4| j}n|tjjkrH| j	}n|tjj
kr\| j}nt|tjjkrp| j}n`|tjjkr| j}nL|tjjkr| j}n8|tjjkr| j}n$|tjjkr| j| d S td||| d S )Nzunknown subheader index)r(   r)   rt   r   Zrow_size_index_process_rowsize_subheaderZcolumn_size_index_process_columnsize_subheaderZcolumn_text_index_process_columntext_subheaderZcolumn_name_index_process_columnname_subheaderZcolumn_attributes_index#_process_columnattributes_subheaderZformat_and_label_index_process_format_subheaderZcolumn_list_index_process_columnlist_subheaderZsubheader_counts_index_process_subheader_countsr   rO   appendr   )r.   r   r   r(   r)   	processorr   r   r   r     s.    z!SAS7BDATReader._process_subheader)r(   r)   r    c                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r>   rw   r   rt   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r.   r(   r)   int_lenZ
lcs_offsetZ
lcp_offsetZmxr   r   r   r     s8    
    
z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r>   r   column_countr   r   print)r.   r(   r)   r   r   r   r   r   (  s    z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r-   r   r.   r(   r)   r   r   r   r   3  s    z(SAS7BDATReader._process_subheader_countsc           
      C  s  || j 7 }| |tj}| ||}|d| d}|}| jrR|| jpN| j	}| j
| t| j
dkrd}tjD ]}||krx|}qx|| _|| j 8 }|d }	| jr|	d7 }	| |	| j}|d}|dkrd| _|d }	| jr|	d7 }	| |	| j}|d| j | _n|tjkrR|d	 }	| jr2|	d7 }	| |	| j}|d| j | _nH| jdkrd| _|d }	| jr||	d7 }	| |	| j}|d| j | _| jrt| d
r| j| jp| j	| _d S )Nr   rp   r   rA      rg           (   creator_proc)r>   r   rt   Ztext_block_size_lengthrv   r}   rI   r~   rG   rJ   rK   r   rs   Zcompression_literalsr*   rw   r   r   r   Zrle_compressionhasattr)
r.   r(   r)   Ztext_block_sizer   Z	cname_rawcnameZcompression_literalZclZoffset1r   r   r   r   6  sZ    




z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }| j||
|
|   q*d S )Nr      rf   r   )r>   r   rt   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrK   rL   r   )r.   r(   r)   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_strr   r   r   r   j  sB      
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   rf   r      d   s)r>   r   rt   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rQ   r   Zcolumn_data_length_lengthrP   Zcolumn_type_lengthrR   )
r.   r(   r)   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r   r   r   r     s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r-   r   r   r   r   r   r     s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sl  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }||||  }| j| }||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r>   rt   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminrs   rK   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthrN   r5   rL   rR   rP   rM   r   )r.   r(   r)   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr   r   r   r     sR       


	z(SAS7BDATReader._process_format_subheaderz
int | NonezDataFrame | None)r   r    c                 C  s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF|   td| j| jkrVd S | j| j }||krn|}| jd}| jd}tj	||ft
d| _tj|d| ftjd| _d| _t| }|| |  }| jd k	r|| j}|S )Nr   zNo columns to parse from filer   r   r[   rf   )rF   r   rs   rR   rZ   r	   rS   countr]   emptyobject_string_chunkzerosZuint8_byte_chunk_current_row_in_chunk_indexr   rr   _chunk_to_dataframerC   Z	set_index)r.   r   mZndnsprsltr   r   r   rr     s.    

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkr(dS t| j| jkrf|   dt| jdd| jdd}t||   | j	}|t
jkr|   |t
j@ }t
jgt
j }|s| j	|kr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rO   rV   rr   r   r?   rs   rZ   r   r   r   rt   r   r   r   r   _read_next_page)r.   r   Z	page_typer   r   r   r   r   r     s$    

zSAS7BDATReader._read_next_pager   c                 C  s  | j }| j}t|| |}i }d\}}t| jD ]\}| j| }| j| dkr| j|d d f j| jd d}	t	j
|	tj|d||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q0| j| dkrnt	j
| j|d d f |d	||< | jr<| jd k	r<|| j| jp4| j||< | jrd|| j d
k}
tj|| |
< |d7 }q0|   tdt| j|  q0t|| j|dd}|S )Nre   r   r   r[   )r\   rC   r   r   r   )rC   r   zunknown column type F)rN   rC   copy) r   rS   r   r   rL   rR   r   viewrz   r   r$   r]   Zfloat64rD   rM   rt   Zsas_date_formatsr%   Zsas_datetime_formatsr   rH   rG   r   r~   rJ   rE   rs   nanrZ   r   reprr   )r.   nr   ixr   ZjsZjbjr7   Zcol_arriiZdfr   r   r   r     s<    
 
 

z"SAS7BDATReader._chunk_to_dataframe)NTTNNTT)N)"r1   r2   r3   __doc__r4   r/   ra   rb   rc   rZ   rW   r   r   r   rv   rX   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rr   r   r   r   r   r   r   r=      sP   
       0 		
4 1"r=   )'r   
__future__r   collectionsr   r   r   r   typingr   Znumpyr]   Zpandas._typingr   r   Zpandas.errorsr	   r
   Zpandasr   r   r   Zpandas.io.commonr   Zpandas.io.sas._sasr   Zpandas.io.sas.sas_constantsioZsasZsas_constantsrt   Zpandas.io.sas.sasreaderr   r   r%   r&   r5   Iteratorr=   r   r   r   r   <module>   s&   