// CSE 142, Homework 7 (DNA)
// This helper program can read NCBI genetic data files with
// .fna and .ptt extensions and preprocess them to create input
// files suitable for use with the HW7 Dna program.

import java.io.*;
import java.util.*;
import java.util.Scanner;

public class ConvertDna {
    // % of characters to randomly make lowercase
    public static final int PERCENT_LOWERCASE = 10;

    // % of lines to have as proteins (the rest are random DNA)
    public static final int PERCENT_PROTEINS = 66;

    // number of header lines skipped at the top of the .ptt input
    private static final int PTT_HEADER_LINES = 3;

    /**
     * Prompts on the console for a genome (.fna) file, a protein table (.ptt)
     * file, an output file and a protein count, then writes the converted
     * records to the output file.
     *
     * @param args ignored
     * @throws IOException if one of the named files cannot be opened
     */
    public static void main(String[] args) throws IOException {
        // Intentionally not closed: closing this Scanner would close System.in.
        Scanner console = new Scanner(System.in);

        System.out.print("Genome (.fna) file? ");
        StringBuilder sb;
        try (Scanner fnaInput = new Scanner(new File(console.nextLine()))) {
            System.out.println("Reading .fna file...");
            sb = readGenome(fnaInput);
        }

        System.out.print("Protein (.ptt) file? ");
        try (Scanner pttInput = new Scanner(new File(console.nextLine()))) {
            System.out.print("Output file? ");
            try (PrintStream out = new PrintStream(new File(console.nextLine()))) {
                System.out.print("How many proteins (-1 for all)? ");
                int proteins = console.nextInt();
                System.out.println("Producing protein output...");
                readProtein(pttInput, proteins, sb, out);
            }
        }
    }

    /**
     * Reads every remaining line of the genome input and concatenates them
     * into one sequence.
     *
     * Lines beginning with '>' are FASTA description lines, not nucleotides;
     * they are skipped so that the coordinates in the .ptt file line up with
     * positions in the returned sequence.
     *
     * @param fnaInput scanner positioned at the start of the .fna data
     * @return the concatenated sequence lines
     */
    private static StringBuilder readGenome(Scanner fnaInput) {
        StringBuilder genome = new StringBuilder(4000000);
        while (fnaInput.hasNextLine()) {
            String line = fnaInput.nextLine();
            if (!line.startsWith(">")) {
                genome.append(line);
            }
        }
        return genome;
    }

    /**
     * Converts records of a protein table into name/sequence line pairs.
     *
     * The first three lines of {@code pttInput} are skipped as headers. Each
     * following non-blank line is parsed as
     * {@code start..end strand length PID gene synonym code COG product...};
     * only "+" strand records are used. For each used record, two lines are
     * printed to {@code out}: a name and a nucleotide range taken from
     * {@code sb} (coordinates are 1-based and inclusive). Roughly
     * {@code 100 - PERCENT_PROTEINS} percent of the records are replaced by a
     * randomly chosen range named "Non-protein region", and about
     * {@code PERCENT_LOWERCASE} percent of each range's characters are
     * lowercased. The random generator uses a fixed seed, so output is
     * repeatable for the same input.
     *
     * @param pttInput scanner over the protein table; not closed by this method
     * @param proteins maximum number of records to write; a negative value
     *                 means no limit
     * @param sb       the genome sequence the coordinates refer to
     * @param out      destination for the converted records
     */
    public static void readProtein(Scanner pttInput, int proteins,
                                   StringBuilder sb, PrintStream out) {
        // Skip the header lines, tolerating input that ends early.
        for (int i = 0; i < PTT_HEADER_LINES; i++) {
            if (!pttInput.hasNextLine()) {
                return;
            }
            pttInput.nextLine();
        }

        Random rand = new Random(42);
        while (proteins != 0 && pttInput.hasNextLine()) {
            String line = pttInput.nextLine();
            if (line.trim().isEmpty()) {
                continue;  // blank lines (e.g. at end of file) carry no record
            }

            try (Scanner lineScan = new Scanner(line)) {
                // ':' and '.' are delimiters so "190..255" yields two integers
                lineScan.useDelimiter("[ \t\n\f\r:.]+");
                int start = lineScan.nextInt();
                int end = lineScan.nextInt();
                String strand = lineScan.next();  // "+" or "-"
                if (!strand.equals("+")) {
                    continue;
                }

                lineScan.next();  // skip length token
                lineScan.next();  // skip PID token
                lineScan.next();  // skip gene token
                lineScan.next();  // skip synonym token
                lineScan.next();  // skip code token
                lineScan.next();  // skip COG token

                // the rest of the line is the product name
                StringBuilder nameBuilder = new StringBuilder(lineScan.next());
                while (lineScan.hasNext()) {
                    nameBuilder.append(' ').append(lineScan.next());
                }
                String name = nameBuilder.toString();

                // nextInt(100) is 0..99, so ">=" keeps exactly PERCENT_PROTEINS
                // percent of the draws as proteins. The draw always happens so
                // the random sequence does not depend on the genome length.
                boolean useRandomDna = rand.nextInt(100) >= PERCENT_PROTEINS;
                if (useRandomDna && sb.length() > 3) {
                    // grab some random dna: a run of 3 + 3k characters,
                    // clamped so it ends before the last genome character
                    start = rand.nextInt(sb.length() - 3) + 1;
                    end = Math.min(start + 3 + 3 * rand.nextInt(100), sb.length()) - 1;
                    name = "Non-protein region";
                }

                int length = end - start + 1;
                out.println(name);

                StringBuilder range = new StringBuilder(sb.substring(start - 1, end));

                // pseudo-randomly change casing of 10% of nucleotides
                // (indexes are drawn with replacement, so fewer may change)
                int changes = length * PERCENT_LOWERCASE / 100;
                for (int i = 0; i < changes; i++) {
                    int index = rand.nextInt(length);
                    range.setCharAt(index, Character.toLowerCase(range.charAt(index)));
                }
                out.println(range);

                proteins--;
            }
        }
    }
}