A Python Scripts
A.1 CreateSubgroups.py
Script:
# /data/cjgwx7/tml_gpsm/scripts/python/CreateSubgroups.py
import pandas as pd
### Specify paths to necessary files for import
# NOTE(review): the paths are relative; presumably the script is run from its
# own directory (scripts/python) -- confirm the working directory.
# Genotype input file
path_genotypes = "../../raw_data/level1/genotypes.txt"
# Pedigree input file used to look up each pig's Line
path_ped_plus_intmd = "../../intmd/pedigree_map/ped_plus_intmd.txt"
### Specify paths for file export
# One output file per subgroup. The letters in each file name stand for the
# lines it contains: c = Crossbred, d = Duroc, l = Landrace, y = Yorkshire.
path_s1 = "../../raw_data/level2/c.txt"
path_s2 = "../../raw_data/level2/d.txt"
path_s3 = "../../raw_data/level2/l.txt"
path_s4 = "../../raw_data/level2/y.txt"
path_s5 = "../../raw_data/level2/cd.txt"
path_s6 = "../../raw_data/level2/cl.txt"
path_s7 = "../../raw_data/level2/cy.txt"
path_s8 = "../../raw_data/level2/dl.txt"
path_s9 = "../../raw_data/level2/dy.txt"
path_s10 = "../../raw_data/level2/ly.txt"
path_s11 = "../../raw_data/level2/dly.txt"
path_s12 = "../../raw_data/level2/cdly.txt"
### Specify column names for input files
# Labels applied to the genotype file columns
colnames_genotypes = ['PigID', 'SNPs']
# Labels applied to the columns kept from the pedigree file
colnames_ped_plus_intmd = ['PigID', 'Line']
### Read in genotype file and replace commas between SNP haplotypes
# header=None: the first line of the file is data, not column labels, so the
# labels come from colnames_genotypes instead.
genotypes = pd.read_csv(path_genotypes,
                        sep="\t",
                        names=colnames_genotypes,
                        header=None)
# regex=True makes replace() substitute inside each string value, so every
# comma is stripped rather than only cells that are exactly ','.
genotypes = genotypes.replace(',', '', regex=True)
### Read in ped_plus_intmd
# The file is space-separated with no header row. Only the first and seventh
# fields (usecols=[0, 6]) are kept, and they are labelled PigID and Line.
ped_plus_intmd = pd.read_csv(path_ped_plus_intmd,
                             sep=" ",
                             names=colnames_ped_plus_intmd,
                             header=None,
                             usecols=[0, 6])
print("Data Imported Successfully")
### Merge genotype and ped_plus_intmd to connect PigID to Line information
# A left merge keeps every genotyped pig, whether or not it has a pedigree row.
genotypes_plus = pd.merge(genotypes,
                          ped_plus_intmd,
                          how='left',
                          on='PigID')
# Only the ID-to-line mapping is needed for building the subgroups.
genotypes_plus = genotypes_plus[['PigID', 'Line']]
# Pigs without a pedigree match have a missing Line and are removed here.
genotypes_plus = genotypes_plus.dropna()
### Create subgroups
# Each subgroup is selected directly with a boolean mask on the Line column.
# genotypes_plus is therefore never modified: no temporary flag column is
# written to the shared frame and then dropped again.
# Line codes: 9006 = Crossbred, 1006 = Duroc, 10 = Landrace, 11 = Yorkshire.
## Crossbred
c = genotypes_plus[genotypes_plus.Line == 9006]
## Duroc
d = genotypes_plus[genotypes_plus.Line == 1006]
## Landrace
l = genotypes_plus[genotypes_plus.Line == 10]
## Yorkshire
y = genotypes_plus[genotypes_plus.Line == 11]
## Crossbred and Duroc
cd = genotypes_plus[genotypes_plus.Line.isin([9006, 1006])]
## Crossbred and Landrace
cl = genotypes_plus[genotypes_plus.Line.isin([9006, 10])]
## Crossbred and Yorkshire
cy = genotypes_plus[genotypes_plus.Line.isin([9006, 11])]
## Duroc and Landrace
dl = genotypes_plus[genotypes_plus.Line.isin([1006, 10])]
## Duroc and Yorkshire
dy = genotypes_plus[genotypes_plus.Line.isin([1006, 11])]
## Landrace and Yorkshire
ly = genotypes_plus[genotypes_plus.Line.isin([10, 11])]
## Duroc, Landrace, and Yorkshire
dly = genotypes_plus[genotypes_plus.Line.isin([1006, 10, 11])]
## Crossbred, Duroc, Landrace, and Yorkshire
cdly = genotypes_plus[genotypes_plus.Line.isin([9006, 1006, 10, 11])]
print("Subgroups Created Successfully")
### Merge subgroup IDs with genotypes
# Only the PigID column of each subgroup is kept. The left merge then pulls in
# the genotype columns for exactly those pigs, preserving the subgroup's rows.
c = pd.merge(c['PigID'], genotypes, how='left', on='PigID')
d = pd.merge(d['PigID'], genotypes, how='left', on='PigID')
l = pd.merge(l['PigID'], genotypes, how='left', on='PigID')
y = pd.merge(y['PigID'], genotypes, how='left', on='PigID')
cd = pd.merge(cd['PigID'], genotypes, how='left', on='PigID')
cl = pd.merge(cl['PigID'], genotypes, how='left', on='PigID')
cy = pd.merge(cy['PigID'], genotypes, how='left', on='PigID')
dl = pd.merge(dl['PigID'], genotypes, how='left', on='PigID')
dy = pd.merge(dy['PigID'], genotypes, how='left', on='PigID')
ly = pd.merge(ly['PigID'], genotypes, how='left', on='PigID')
dly = pd.merge(dly['PigID'], genotypes, how='left', on='PigID')
cdly = pd.merge(cdly['PigID'], genotypes, how='left', on='PigID')
print("Genotype File Merged Successfully")
### Check file length (Number of pigs before quality control)
# Pair each report message with its subgroup frame, then print the row counts
# in subgroup order.
subgroup_reports = (
    ("The length of Subgroup 1 is:", c),
    ("The length of Subgroup 2 is:", d),
    ("The length of Subgroup 3 is:", l),
    ("The length of Subgroup 4 is:", y),
    ("The length of Subgroup 5 is:", cd),
    ("The length of Subgroup 6 is:", cl),
    ("The length of Subgroup 7 is:", cy),
    ("The length of Subgroup 8 is:", dl),
    ("The length of Subgroup 9 is:", dy),
    ("The length of Subgroup 10 is:", ly),
    ("The length of Subgroup 11 is:", dly),
    ("The length of Subgroup 12 is:", cdly),
)
for report_message, subgroup_frame in subgroup_reports:
    print(report_message, len(subgroup_frame))
print("Starting output of subgroup files")
# Write each subgroup as a tab-separated file with no index column and no
# header row.
c.to_csv(path_s1, sep="\t", index=False, header=False)
d.to_csv(path_s2, sep="\t", index=False, header=False)
l.to_csv(path_s3, sep="\t", index=False, header=False)
y.to_csv(path_s4, sep="\t", index=False, header=False)
cd.to_csv(path_s5, sep="\t", index=False, header=False)
cl.to_csv(path_s6, sep="\t", index=False, header=False)
cy.to_csv(path_s7, sep="\t", index=False, header=False)
dl.to_csv(path_s8, sep="\t", index=False, header=False)
dy.to_csv(path_s9, sep="\t", index=False, header=False)
ly.to_csv(path_s10, sep="\t", index=False, header=False)
dly.to_csv(path_s11, sep="\t", index=False, header=False)
cdly.to_csv(path_s12, sep="\t", index=False, header=False)
print("Genotype files for each subgroup successfully written")
Log:
Data Imported Successfully
Subgroups Created Successfully
Genotype File Merged Successfully
The length of Subgroup 1 is: 8532
The length of Subgroup 2 is: 16802
The length of Subgroup 3 is: 19342
The length of Subgroup 4 is: 18368
The length of Subgroup 5 is: 25334
The length of Subgroup 6 is: 27874
The length of Subgroup 7 is: 26900
The length of Subgroup 8 is: 36144
The length of Subgroup 9 is: 35170
The length of Subgroup 10 is: 37710
The length of Subgroup 11 is: 54512
The length of Subgroup 12 is: 63044
Starting output of subgroup files
Genotype files for each subgroup successfully written