您的位置:首页 > 其它

gff文件解析

2016-02-26 00:00 453 查看
摘要: 记录下gff文件解析方法,免得忘了。

filter_gff3_by_transcript_id.pl

#!/usr/bin/perl
# by lhtk : lhtk80t7@gmail.com
use strict;
use warnings;

use Bio::Tools::GFF;
use feature 'switch';
use experimental 'smartmatch';
#use Getopt::Long;

my $transcript_id_file = $ARGV[0];
my $gff_in = $ARGV[1];
my $gff_out = $ARGV[2];

die "根据 transcript_id 来过滤 gff3 文件。\n\nUsage:\n\t$0 transcript_id_file gff3_in gff3_out\n" unless (defined $transcript_id_file and defined $gff_in and defined $gff_out);
die "文件已存在!" if ((-e $gff_out) and ($gff_out ne 'tmp.gff3'));

open IDS, '<', $transcript_id_file or die $!;
my %transcript_ids;
my %gene_ids;
while (<IDS>) {
next if /^\s*$/;
chomp;
$transcript_ids{$_} = 1;
my $gid = (split /\./, $_)[0];
$gene_ids{$gid} = 1;
}
close IDS;
print "Get transcript_ids and gene_ids.\n\n";

# debug
# while (my($k,$v) = each %transcript_ids) {
#     print "$k => $v\n";
# }
# while (my($k,$v) = each %gene_ids) {
#     print "$k => $v\n";
# }

my $gffio_i = Bio::Tools::GFF->new(-file => $gff_in, -gff_version => 3);
my $gffio_o = Bio::Tools::GFF->new(-file => '>' . $gff_out, -gff_version => 3);

while (my $f = $gffio_i->next_feature()) {
# debug
# print $f->primary_tag(), "\n";

given ($f->primary_tag()) {
when (/predicted_gene/) {
# my $id = $f->primary_id(); # 也可以
my ($id) = $f->get_tag_values('ID');
# print "$id\n"; # debug
next unless exists $gene_ids{$id};
}
when (/mRNA/) {
my ($id) = $f->get_tag_values('ID');
next unless exists $transcript_ids{$id};
}
when (/exon|CDS/) {
my ($parent) = $f->get_tag_values('Parent');
next unless exists $transcript_ids{$parent};
}
default {
die "Do you know this? -- ", $f->primary_tag(), "\n";
}
}
$gffio_o->write_feature($f);
}
$gffio_i->close();
$gffio_o->close();

print "Done!\n";

获取 gff 文件第一列数据:
$feat->seq_id()
,更多请参考
Bio::SeqFeature::Generic
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  perl gff