-
Notifications
You must be signed in to change notification settings - Fork 52
/
extract_lincRNA.py
executable file
·31 lines (25 loc) · 1.06 KB
/
extract_lincRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/env python
"""
Extract lincRNA coordinates from GTF
"""
import sys
import GTF
import numpy as np
import pandas as pd
# Columns written to each output file, in output order.
_BED_COLUMNS = ("seqname", "start", "end", "gene_id", "gene_name", "strand")


def _write_bed(gc, type_column, feature, path):
    """Write the lincRNA rows of one feature level to a tab-separated file.

    Args:
        gc: Annotation table with a ``feature`` column, the column named by
            ``type_column``, and every column in ``_BED_COLUMNS``.
        type_column: Column compared against ``"lincRNA"``.
        feature: Value of ``gc.feature`` to keep.
        path: Output file path; an existing file is overwritten.
    """
    mask = (gc["feature"] == feature) & (gc[type_column] == "lincRNA")
    # `.loc` replaces `DataFrame.ix`, which no longer exists in pandas >= 1.0.
    # The explicit copy keeps the casts below from writing into a view of `gc`
    # (and from raising SettingWithCopyWarning).
    subset = gc.loc[mask, list(_BED_COLUMNS)].copy()
    subset["start"] = subset["start"].astype(int)
    subset["end"] = subset["end"].astype(int)
    subset = subset.sort_values(by=["seqname", "start", "end"])
    # NOTE(review): coordinates are written exactly as GTF.dataframe returned
    # them; if those are GTF's 1-based starts the file is not strict 0-based
    # BED -- confirm against downstream consumers before changing.
    subset.to_csv(path, sep="\t", header=False, index=False)


def main(GENCODE):
    """Extract lincRNA coordinates from an annotation file.

    Loads the annotation with ``GTF.dataframe``, removes every ``.<digits>``
    run from ``gene_id`` (e.g. ``ENSG00000123456.7`` -> ``ENSG00000123456``),
    and writes two headerless, index-free, tab-separated files into the
    current working directory:

    * ``lincRNA.bed``: rows with ``feature == "transcript"`` and
      ``transcript_type == "lincRNA"``.
    * ``lincRNA_genes.bed``: rows with ``feature == "gene"`` and
      ``gene_type == "lincRNA"``.

    Both files hold the columns seqname, start, end, gene_id, gene_name and
    strand, sorted by seqname, start, end.

    Args:
        GENCODE: Annotation file location, passed unchanged to
            ``GTF.dataframe``.
    """
    gc = GTF.dataframe(GENCODE)
    gc["gene_id"] = gc["gene_id"].replace(
        to_replace=r"\.[0-9]+", value="", regex=True
    )

    _write_bed(gc, "transcript_type", "transcript", "lincRNA.bed")
    _write_bed(gc, "gene_type", "gene", "lincRNA_genes.bed")
if __name__ == "__main__":
    # Exit with a usage hint (status 1, message on stderr) instead of a bare
    # IndexError traceback when the annotation path is missing. Extra
    # arguments are still ignored, as before.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <gencode.gtf>".format(sys.argv[0]))
    main(sys.argv[1])