Обновлено 03.03.2009 Автор: Administrator
Соответствие набора файлов с множественным выравниванием набору требований. Perl-скрипт
# Checks if alignment files in subdir correspond to certain requirements.
# Mandatory requirements for alignment:
# - only CLUSTAL format!
# - all alignments must be of the same length
# NB: assume there's only one alignment in file
use Cwd;
use Getopt::Long; #http://perldoc.perl.org/Getopt/Long.html
use Bio::AlignIO;
sub help{
print "Checks if alignment files in given subdir satisfy to certain requirements.\n";
print "Assume there's only one alignment in file\n";
print "Usage:\n";
print " -n NUM exact_num_organisms_to_present_in_alignment\n";
print " -d subdirectory with alignment files\n";
print " -delete set this flag to delete inappropriate alignments\n";
print " -l NUM minimum length of alignment\n";
print " -e file extension, defaults - faa\n";
print " -f alignment format :fasta(default), clustal\n";
}
sub process_file {
my $file=$_[0]; # assigning parameters
my $alnfile = new Bio::AlignIO(-format => $ALIGNFORMAT, -file => $file);
my $aln;
eval { # handling possible exceptions
$aln = $alnfile->next_aln(); #NB: assume there's only one alignment in file
};
if ($@) { print "$file: error parsing alignment\n"; goto PROCESS; }
if(!($aln)) { print "$file: possibly non-CLUSTAL format\n"; goto PROCESS; }
if(!($aln->is_flush)) { print "$file: not all alignments are of the same length!\n"; goto PROCESS; }
$length = $aln->length;
$Nsequences = $aln->no_sequences;
if ($Nsequences<$NUMORGS) { print "$file: #sequences = $Nsequences\n"; goto PROCESS; }
if ($length<$MIN_LENGTH) { print "$file: length = $length\n"; goto PROCESS; }
foreach my $seq ($aln->each_seq) {
if(!($seq->seq =~ m/[ATGC]/)) { print "$file: all-gaps sequence\n";goto PROCESS; }
}
return; # file sucessfully passed all checkings
PROCESS: # file is inappropriate
if ($DELETE_INAPPROPRIATE){
$alnfile=undef; # we need to close file first in order to delete it
$file=~s/\//\\/g; #replace / to \
$file=~s/\.//; #delete first "."
$cmdstr="del ".$srcdir.$file;
system($cmdstr);
}
} #process_file
#--------------------- main part --------------------------------
# GLOBAL VARIABLES
$ALIGNDIR =undef; # subdir where to search for alignments
$NUMORGS = undef;
$DELETE_INAPPROPRIATE=undef;
$MIN_LENGTH=undef;
$EXTENSION = undef; # extension of alignments to process
$ALIGNFORMAT = "fasta";
GetOptions ('n=i' => \$NUMORGS,
'd=s' => \$ALIGNDIR,
'delete' => \$DELETE_INAPPROPRIATE,
'l=i' => \$MIN_LENGTH,
'f=s' => \$ALIGNFORMAT, # possible: fasta(default), clustal
'e=s' => \$EXTENSION,
'help|?|h' => \$help );
if($help) { help(); exit;}
if(!($NUMORGS)) { die "please specify -n key (NUMORGS)"; }
if(!($ALIGNDIR)) { $ALIGNDIR="alignments"; print "directory ommited, was set to: $ALIGNDIR \n"; }
if ($DELETE_INAPPROPRIATE) {print "Deleting inappropriate alignments : on\n"; }
if(!($EXTENSION)) { $EXTENSION="faa"; print "file extension ommited! Set to $EXTENSION\n"; }
if (!($ALIGNFORMAT)) {$ALIGNFORMAT="fasta"; print "alignment format ommited! Set to $ALIGNFORMAT\n";}
if ((!($ALIGNFORMAT) eq 'fasta')||(!($ALIGNFORMAT) eq 'clustal'))
{$ALIGNFORMAT="fasta"; print "Unknown specified alignment format! Set to $ALIGNFORMAT\n";}
# Bioperl warns if slice of alignment contains no residues in some organism
# => redirecting STDERR!
open STDERR, '>', "STDERR.out" or die "Can't redirect STDERR: $!";
$srcdir=cwd(); # cwd=get working directory
$srcdir=~s/\//\\/g; #replace / to \
opendir(DATADIR, "./".$ALIGNDIR."/") or die ("Error opening directory $name");
@FILES=grep(/\.$EXTENSION$/i, readdir DATADIR);
$NFILES=@FILES;
if($NFILES==0) { print "no alignments found in subdir $ALIGNDIR\n"; die; }
else { print "found $NFILES alignments\n"; }
foreach(@FILES) { process_file("./".$ALIGNDIR.$name.'/'.$_) }
closedir(DATADIR);
print "#Work is finished";
< Предыдущая | Следующая > |
---|