Data Cleaning: Check whether each value is within or outside the expected range
- To eliminate duplicate values and missing values
- Report the data in RTF or third-party files
/* Read the raw text file into work.dm using list input
   (blank-delimited fields; character fields are marked with $).
   TRUNCOVER keeps each input line as exactly one observation: if a line
   has fewer fields than the INPUT statement expects, the remaining
   variables are set to missing instead of being read from the next line.
   This keeps _n_ aligned with the source line number, which the later
   checks report as obsno. */
data dm;
    infile 'C:\Documents and Settings\mobileclub\Desktop\source\DLM\edit-2.txt' truncover;
    input sno $ pid $ age gender $ race $ color $ crfno;
run;
/* Create (or overwrite) the report file and write its column-header row.
   Later steps append to the same file with the MOD option. */
data _null_;
    file 'D:\dcm.rtf';
    put @5  'dataset'
        @15 'variable'
        @30 'obsno'
        @45 'value'
        @60 'datacheck';
run;
/* Append invalid records to the report: sno must be exactly 4 characters.
   One line is written per failing observation, laid out under the header
   columns dataset / variable / obsno / value / datacheck. */
data _null_;
    set dm (keep=sno);
    file 'D:\dcm.rtf' mod;
    /* @45 writes the actual offending sno value (not a literal label)
       so the "value" column shows what was found in the data. */
    if length(sno) ne 4 then
        put @5 'dm' @15 'sno' @30 _n_ @45 sno @60 'sno 4digits';
run;
/* Append invalid records to the report: sno must begin with the letter S.
   One line is written per failing observation. */
data _null_;
    set dm (keep=sno);
    file 'D:\dcm.rtf' mod;
    /* @45 writes the actual offending sno value (not a literal label)
       so the "value" column shows what was found in the data. */
    if substr(sno,1,1) ne 'S' then
        put @5 'dm' @15 'sno' @30 _n_ @45 sno @60 'sno start with S';
run;
/* Append invalid records to the report: gender must be F or M.
   One line is written per failing observation. */
data _null_;
    set dm (keep=gender);
    file 'D:\dcm.rtf' mod;
    /* The value list needs two separate constants. Written as 'F''M' it is
       a single string F'M (a doubled quote is an escaped quote), which
       would flag every observation as invalid.
       @45 writes the actual gender value found in the data. */
    if gender not in ('F','M') then
        put @5 'dm' @15 'gender' @30 _n_ @45 gender @60 'gender be F or M';
run;
/*data check specification*/
/* Build invalid3: one row per observation of dm whose sno fails a check.
   Checks, in priority order (only the first failure is reported):
     1. sno must be exactly 4 characters long
     2. sno must start with 'S'
   Output columns: crfno, variable, obsno (row number in dm), value, dc.
   Observations that pass both checks are deleted. */
data invalid3;
    set dm(keep=sno crfno);
    /* Declare lengths up front so neither message is truncated to the
       length of whichever assignment the compiler happens to see first. */
    length variable $3 obsno 8 value $8 dc $20;
    if length(sno) ne 4 then do;
        variable='sno';
        obsno=_n_;
        value=sno;
        dc='must be 4digits';
    end;
    else if substr(sno,1,1) ne 'S' then do;
        variable='sno';
        obsno=_n_;
        /* Report the offending value and the rule it broke; without these
           the row would carry blank value and dc fields. */
        value=sno;
        dc='must start with S';
    end;
    else delete;
    drop sno;
run;
proc print data=invalid3;
run;
/* Build invalid4: one row per observation of dm that fails an age or
   gender check (lower-case gender codes).
   Checks, in priority order (only the first failure is reported):
     1. age must be >= 18 (a missing age compares as less than 18)
     2. gender must be 'f' or 'm'
   Output columns: crfno, variable, obsno (row number in dm), value, dc.
   Observations that pass both checks are deleted. */
data invalid4;
    set dm(keep=age gender crfno);
    /* Explicit lengths/types:
       - variable would otherwise take length 3 from 'age' and store
         'gender' as 'gen';
       - value must be character so it can hold either an age or a gender
         (if first assigned from age it becomes numeric and the gender
         value is lost);
       - dc would otherwise be truncated to the first message's length. */
    length variable $8 obsno 8 value $8 dc $20;
    if age<18 then do;
        variable='age';
        obsno=_n_;
        /* convert the numeric age to left-aligned text */
        value=strip(put(age, best8.));
        dc='must be >=18';
    end;
    else if gender ne 'f' and gender ne 'm' then do;
        variable='gender';
        obsno=_n_;
        value=gender;
        dc='must be f or m';
    end;
    else delete;
    drop age gender;
run;
proc print data=invalid4;
run;
/* Build invalid5: one row per observation of dm that fails an age or
   gender check (upper-case gender codes).
   Checks, in priority order (only the first failure is reported):
     1. age must be >= 18 (a missing age compares as less than 18)
     2. gender must be 'F' or 'M'
   Output columns: crfno, variable, obsno (row number in dm), value, dc.
   Observations that pass both checks are deleted. */
data invalid5;
    set dm(keep=age gender crfno);
    /* Explicit lengths/types:
       - variable would otherwise take length 3 from 'age' and store
         'gender' as 'gen';
       - value must be character so it can hold either an age or a gender
         (if first assigned from age it becomes numeric and the gender
         value is lost);
       - dc would otherwise be truncated to the first message's length. */
    length variable $8 obsno 8 value $8 dc $20;
    if age<18 then do;
        variable='age';
        obsno=_n_;
        /* convert the numeric age to left-aligned text */
        value=strip(put(age, best8.));
        dc='must be >=18';
    end;
    else if gender ne 'F' and gender ne 'M' then do;
        variable='gender';
        obsno=_n_;
        value=gender;
        /* message matches the upper-case codes actually tested above */
        dc='must be F or M';
    end;
    else delete;
    drop age gender;
run;
proc print data=invalid5;
run;
/*conversions*/
/* Build dm5: a normalised copy of dm.
   - sno, pid, race, color are upper-cased; gender is lower-cased.
   - sno and pid shorter than 4 characters are padded to 4 by inserting
     zeros after the first character (e.g. S1 -> S001, S12 -> S012).
   - numeric gender codes are mapped: '2' -> 'f', '1' -> 'm'. */
data dm5;
    set dm;
    sno=upcase(sno);
    pid=upcase(pid);
    gender=lowcase(gender);
    race=upcase(race);
    color=upcase(color);
    /* Insert zeros until the id is 4 characters long; a single insertion
       would leave ids of length 1 or 2 still too short. Blank ids are
       skipped so that a missing value is not turned into zeros. */
    do while (not missing(sno) and length(sno) < 4);
        sno=substr(sno,1,1)||'0'||substr(sno,2);
    end;
    do while (not missing(pid) and length(pid) < 4);
        pid=substr(pid,1,1)||'0'||substr(pid,2);
    end;
    if gender='2' then gender='f';
    if gender='1' then gender='m';
run;
proc print data=dm5;
run;
No comments:
Post a Comment