Проверка множественных выравниваний

Соответствие набора файлов с множественным выравниванием набору требований. Perl-скрипт
# Checks if alignment files in subdir correspond to certain requirements.
# Mandatory requirements for alignment:
#  - only CLUSTAL format!
#  - all alignments must be of the same length
# NB: assume there's only one alignment in file

use Cwd;
use Getopt::Long;  #http://perldoc.perl.org/Getopt/Long.html
use Bio::AlignIO;

sub help{
  print "Checks if alignment files in given subdir satisfy to certain requirements.\n";
  print "Assume there's only one alignment in file\n";
  print "Usage:\n";
  print " -n NUM   exact_num_organisms_to_present_in_alignment\n";
  print " -d       subdirectory with alignment files\n";
  print " -delete  set this flag to delete inappropriate alignments\n";
  print " -l NUM   minimum length of alignment\n";
  print " -e       file extension, defaults - faa\n";
  print " -f       alignment format :fasta(default), clustal\n"; 

sub process_file {
  my $file=$_[0];   # assigning parameters

  my $alnfile = new Bio::AlignIO(-format => $ALIGNFORMAT, -file => $file);
  my $aln;
  eval {       # handling possible exceptions
    $aln = $alnfile->next_aln();  #NB: assume there's only one alignment in file
  if ($@) { print "$file: error parsing alignment\n"; goto PROCESS; } 
  if(!($aln)) { print "$file: possibly non-CLUSTAL format\n"; goto PROCESS; }
  if(!($aln->is_flush)) { print "$file: not all alignments are of the same length!\n"; goto PROCESS; }
  $length = $aln->length;
  $Nsequences = $aln->no_sequences;
  if ($Nsequences<$NUMORGS) { print "$file: #sequences = $Nsequences\n"; goto PROCESS; }
  if ($length<$MIN_LENGTH)  { print "$file: length = $length\n"; goto PROCESS; }

  foreach my $seq ($aln->each_seq) {
    if(!($seq->seq =~ m/[ATGC]/))   { print "$file: all-gaps sequence\n";goto PROCESS; }

  return;     # file sucessfully passed all checkings

  PROCESS:    # file is inappropriate
    $alnfile=undef;   # we need to close file first in order to delete it
    $file=~s/\//\\/g; #replace / to \
    $file=~s/\.//;    #delete first "."
    $cmdstr="del ".$srcdir.$file;
} #process_file

#--------------------- main part --------------------------------
  $ALIGNDIR =undef;           # subdir where to search for alignments
  $NUMORGS = undef;
  $EXTENSION = undef;           # extension of alignments to process
  $ALIGNFORMAT = "fasta";       

  GetOptions ('n=i'   =>    \$NUMORGS,
              'd=s'   =>    \$ALIGNDIR,
              'delete' =>   \$DELETE_INAPPROPRIATE,
              'l=i'   =>    \$MIN_LENGTH,
              'f=s' =>      \$ALIGNFORMAT,  # possible: fasta(default), clustal 
              'e=s' =>      \$EXTENSION,  
              'help|?|h' => \$help );
  if($help) { help(); exit;} 
  if(!($NUMORGS))  { die   "please specify -n key (NUMORGS)"; }
  if(!($ALIGNDIR)) { $ALIGNDIR="alignments"; print "directory ommited, was set to: $ALIGNDIR \n"; }
  if ($DELETE_INAPPROPRIATE) {print "Deleting inappropriate alignments : on\n"; }
  if(!($EXTENSION)) { $EXTENSION="faa"; print "file extension ommited! Set to $EXTENSION\n"; }
  if (!($ALIGNFORMAT)) {$ALIGNFORMAT="fasta"; print "alignment format ommited! Set to $ALIGNFORMAT\n";} 
  if ((!($ALIGNFORMAT) eq 'fasta')||(!($ALIGNFORMAT) eq 'clustal'))
    {$ALIGNFORMAT="fasta"; print "Unknown specified alignment format! Set to $ALIGNFORMAT\n";} 

# Bioperl warns if slice of alignment contains no residues in some organism
#  => redirecting STDERR!
  open STDERR, '>', "STDERR.out" or die "Can't redirect STDERR: $!";

  $srcdir=cwd();  # cwd=get working directory
  $srcdir=~s/\//\\/g; #replace / to \

  opendir(DATADIR, "./".$ALIGNDIR."/") or die ("Error opening directory $name");
  @FILES=grep(/\.$EXTENSION$/i, readdir DATADIR);
  if($NFILES==0) { print "no alignments found in subdir $ALIGNDIR\n"; die; }
    else {  print "found $NFILES alignments\n"; }
  foreach(@FILES) { process_file("./".$ALIGNDIR.$name.'/'.$_) }
  print "#Work is finished";

Добавить комментарий

Защитный код