# Split the BED12 transcript set into non-coding and coding transcripts.
#
# No all-transcript bedToExons pass is run here: knownGene32_exons.bed is
# (re)written further down from the coding transcripts only, before anything
# reads it, so an earlier pass over knownGene32_12.bed would be discarded.

#######################################
# Partition a BED file on whether thickStart (col 7) equals thickEnd (col 8).
# The two outputs are exact complements, so every input record lands in
# exactly one of them and input order is preserved in both.
# Arguments:
#   $1 - input BED (needs at least 8 columns)
#   $2 - output: records with thickStart == thickEnd
#   $3 - output: all remaining records
#######################################
split_transcripts_by_coding() {
  awk '$7 == $8' "$1" > "$2"
  # Direct complement of the filter above; replaces the former
  # "diff -U ... | sed | tail -n+2" round trip, which depended on an external
  # helper script and on stripping diff's "---" header line by position.
  awk '$7 != $8' "$1" > "$3"
}

#if thickStart == thickEnd there is no coding region. This may look crude, but
#(per the original author) comparing ENST records against the biotypes in the
#GENCODE gff confirms it holds.
split_transcripts_by_coding knownGene32_12.bed ncRNA_knownGene32_temp.bed cds_knownGene32_12.bed
# Explode the BED12 transcript sets into per-exon records.
#   non-coding transcripts -> ncRNA_knownGene32_exons.bed
bedToExons ncRNA_knownGene32_temp.bed ncRNA_knownGene32_exons.bed
#   coding transcripts, full exons -> knownGene32_exons.bed
bedToExons cds_knownGene32_12.bed knownGene32_exons.bed
#   coding transcripts, -cdsOnly output -> temp.bed (scratch file, left on disk)
bedToExons -cdsOnly cds_knownGene32_12.bed temp.bed

# Keep only records whose start (col 2) differs from their end (col 3).
# Original author's note: bedToExons reports thickStart == thickEnd as "coding";
# those records are ncRNA and must not appear in the coding exon list.
awk '!($2 == $3)' temp.bed > cds_knownGene32_exons.bed

# UTRs of coding transcripts = exons of coding transcripts minus their coding
# portions (ncRNA exons were never part of knownGene32_exons.bed).
bedtools subtract \
  -a knownGene32_exons.bed \
  -b cds_knownGene32_exons.bed \
  > utr_knownGene32_exons.bed
# Lift all three exon sets through the hg38 -> hg38reps chain.

#######################################
# Run liftOver in -multiple mode on one BED file.
# Arguments:
#   $1 - input BED
#   $2 - output stem; mapped records go to "<stem>.bed" and records that
#        could not be mapped go to "<stem>.unmapped"
#######################################
lift_exons_to_reps() {
  liftOver "$1" ../lift/hg38_to_hg38reps.over.chain "${2}.bed" "${2}.unmapped" -multiple
}

# Note the ncRNA output stem ends in "_12" rather than "_exons"; the later
# stages refer to it by that name.
lift_exons_to_reps ncRNA_knownGene32_exons.bed RepBro_ncRNA_knownGene32_12
lift_exons_to_reps cds_knownGene32_exons.bed RepBro_cds_knownGene32_exons
lift_exons_to_reps utr_knownGene32_exons.bed RepBro_utr_knownGene32_exons
# Rewrite thickStart/thickEnd on the lifted exons: coding exons become entirely
# thick, non-coding and UTR exons entirely thin (exons containing both kinds of
# sequence were already split in two upstream).
# NOTE(review): t1.bed/t2.bed/t3.bed are not read by any later step visible in
# this part of the script — confirm whether something else consumes them.

#######################################
# Set col 7 to the record's start, col 8 to the chosen column and col 9 to 0,
# then emit the record tab-separated.
# Arguments:
#   $1 - input BED
#   $2 - output BED
#   $3 - column copied into col 8: 2 (start => thin) or 3 (end => thick)
#######################################
stamp_thick_fields() {
  awk -v OFS="\t" -v src="$3" '{ $7 = $2; $8 = $src; $9 = 0; print }' "$1" > "$2"
}

stamp_thick_fields RepBro_ncRNA_knownGene32_12.bed t1.bed 2
stamp_thick_fields RepBro_cds_knownGene32_exons.bed t2.bed 3
stamp_thick_fields RepBro_utr_knownGene32_exons.bed t3.bed 2
# Add the keyfile annotation columns to each lifted exon set.

#######################################
# Join lifted exons (col 4 = transcript name) with columns 1,2,7 of the
# keyfile (col 1 = transcript name) and print 10-column records to stdout:
#   chrom, start, end, keyfile col 7, "0", strand, thickStart, thickEnd,
#   transcript name, keyfile col 2
# thickStart is always the record's start. Exact duplicate lines are removed.
# (keyfile col 7 is presumably the gene name and col 2 a gene-level id —
# confirm against the keyfile.)
#
# join needs both inputs ordered on the join field alone, using the same
# collation join itself uses. Hence "-k Nb,N" (not "-k N", which keys on
# field N through end of line) and LC_ALL=C on every sort and on join.
#
# Arguments:
#   $1 - lifted BED with at least 6 columns
#   $2 - joined-record column copied into thickEnd:
#        3 (start => thin record) or 4 (end => thick record)
#   $3 - keyfile path (optional, default knownGene32_keyfile.txt)
# Outputs: annotated, de-duplicated records on stdout
#######################################
annotate_lifted_exons() {
  local lifted_bed=$1
  local thick_end_col=$2
  local keyfile=${3:-knownGene32_keyfile.txt}

  LC_ALL=C join -1 4 -2 1 \
      <(LC_ALL=C sort -k 4b,4 "$lifted_bed") \
      <(cut -f 1,2,7 "$keyfile" | LC_ALL=C sort -k 1b,1) \
    | awk -v OFS="\t" -v te="$thick_end_col" \
        '{print $2,$3,$4,$8,"0", $6,$3,$te,$1,$7}' \
    | LC_ALL=C sort -u
}

# ncRNA and UTR exons are thin (thickEnd = start); CDS exons are thick.
annotate_lifted_exons RepBro_ncRNA_knownGene32_12.bed 3 \
  | bedtools sort > RepBro_ncRNA_knownGene32_14.bed
annotate_lifted_exons RepBro_cds_knownGene32_exons.bed 4 \
  | bedtools sort > RepBro_cds_knownGene32_exons_14.bed
annotate_lifted_exons RepBro_utr_knownGene32_exons.bed 3 \
  | bedtools sort > RepBro_utr_knownGene32_exons_14.bed
# Grouping on the first eight columns (which include the gene name in col 4)
# lets us collapse isoforms of the same gene whose exon maps to exactly the
# same consensus position into one display record; cols 9 and 10 of the merged
# rows are kept as comma-separated lists.

#######################################
# Print a tab-separated file ordered on columns 1-8: chrom (bytewise), start
# (numeric), then cols 3-8 as one string key. Rows that agree on all eight
# columns therefore come out adjacent, and the result is still ordered by
# chrom then start.
# Arguments: $1 - input file
# Outputs:   sorted records on stdout
#######################################
sort_by_group_columns() {
  LC_ALL=C sort -t $'\t' -k1,1 -k2,2n -k3,8 "$1"
}

#######################################
# Collapse rows sharing columns 1-8 into one row.
# bedtools groupby only merges adjacent rows, and the input is only known to
# be ordered by chrom and start, so it is re-sorted on every grouping column
# first; otherwise one group can be emitted as several records.
# Arguments:
#   $1 - input 10-column file
#   $2 - output file
#######################################
collapse_isoforms() {
  sort_by_group_columns "$1" \
    | bedtools groupby -i stdin -g 1,2,3,4,5,6,7,8 -o collapse -c 9,10 > "$2"
}

collapse_isoforms RepBro_cds_knownGene32_exons_14.bed collapse_RepBro_cds_knownGene32_exons_14.bed
collapse_isoforms RepBro_ncRNA_knownGene32_14.bed collapse_RepBro_ncRNA_knownGene32_14.bed
collapse_isoforms RepBro_utr_knownGene32_exons_14.bed collapse_RepBro_utr_knownGene32_exons_14.bed
# Convert each collapsed track (already merged per gene and consensus
# coordinate by the groupby stage) into a bigBed file.

#######################################
# Build one bigBed from a collapsed 10-column BED (8 standard + 2 extra
# columns) using the hg38reps chromosome sizes.
# Arguments:
#   $1 - collapsed input BED
#   $2 - output .bb file
#######################################
build_bigbed() {
  bedToBigBed "$1" ../hg38reps/hg38reps.sizes "$2" -type=bed8+2
}

build_bigbed collapse_RepBro_cds_knownGene32_exons_14.bed gencode_cds.bb
build_bigbed collapse_RepBro_ncRNA_knownGene32_14.bed gencode_ncRNA.bb
build_bigbed collapse_RepBro_utr_knownGene32_exons_14.bed gencode_utr.bb