# Copyright (c) 2020 Brit Nicholson. # All rights reserved. This program is made available under # the terms of the Simple Non Code License (SNCL) v2.3.0, # which is available at # https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt # Training model for determining the average number of "segments" passed from a simulated # parent to a child based on the known standard deviation between siblings. import numpy as np import random import math import pandas as pd # The below function allows a "child" to "inherit" 50% of each of his/her parents' DNA # The halved # of segments that a parent has available is passed as the 1st arg. # The labeled "DNA segments" of the first parent are passed as the 2nd arg. # An optional argument is the labeled segments of the 2nd parent. This is always used here. def recomb(fun_half_segs, first_anc, second_anc=None): # Half of the 1st parent's segments are passed out = random.sample(first_anc, fun_half_segs) if second_anc != None: # Half of the 2nd parent's segments are passed (added to the list) out.extend(random.sample(second_anc, fun_half_segs)) return out # Initialize the number of segments segs = 96.95 # Assuming std_dev is just under 0.036. # Doesn't really matter if it's + or - at first. It will correct itself. difference = -0.0001 segs_out = [] mean_out = [] std_dev_out = [] # The simulation appears to keep running with the value: 0.000001. # With one less zero it would sometimes exit the while loop # when the number of segments was around 97.5 while abs(difference) > 0.000001: if difference > 0: # If std_dev is high, the # of segs. needs to be increased # Got too much variability with 0.001, even w/ 200,401 sib_pairs segs += 0.05 else: # If std_dev is low, the # of segs. needs to be decreased segs -= 0.05 percent_same = [] # This is how many trials will be done to compair a pair of siblings sib_pairs = 200401 for i in range(0, sib_pairs): # Only the decimal portion remaining after halving the segments. # This allows #s other than even whole integer #s of segments. seg_decimal = segs/2 - int(segs/2) if np.random.random() > seg_decimal: # If the decimal is 0.75, round down the halved # of segments 25% of the time. half_segs = math.floor(segs/2) else: # If the decimal is 0.75, round up the halved # of segments 75% of the time. half_segs = math.ceil(segs/2) # Make labeled "segments" for maternal and paternal ancestors (parents here): m = list(range(0, half_segs*2)) p = list(m) m = ['m' + str(i) for i in m] p = ['p' + str(i) for i in p] # Call the function for the first sibling s1 = recomb(half_segs, m, p) # Call the function for the second sibling s2 = recomb(half_segs, m, p) # Take the unique intersection (shared segments) of the siblings in_common = list(set(s1) & set(s2)) # Calculate the fraction of segments per sibling that is shared percent_same.append(len(in_common)/(2*half_segs)) # Find the standard deviation of all of the shared fractions std_dev = np.std(percent_same) # Find if the model overshot or undershot, for future correction difference = std_dev - 0.036 segs_out.append(segs) mean_out.append(np.mean(percent_same)) std_dev_out.append(std_dev) print(segs) print(std_dev) #print(min(percent_same)) #print(max(percent_same))
Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA you have when combined with various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.
Recent Comments