# Copyright (c) 2020 Brit Nicholson. # All rights reserved. This program is made available under # the terms of the Simple Non Code License (SNCL) v2.3.0, # which is available at # https://github.com/MysteryDash/Simple-Non-Code-License/blob/master/License.txt # This is the code that was used to calculate expected percentages of X-DNA shared # between a person and various ancestors. More information can be found here: # https://dna-sci.com/2020/03/23/modeling-the-inheritance-of-x-chromosome-dna/ import numpy as np import random import fnmatch def recomb_a_first(mat_anc, fun_cut_spots, pat_anc=None): mat_a = mat_anc[:,0].tolist() mat_b = mat_anc[:,1].tolist() if len(cut_spots) == 0: recomb_out = mat_a return recomb_out recomb_out = [] for i in range(0, len(fun_cut_spots)): if i == 0: recomb_out = mat_a[0:fun_cut_spots[0]] if i % 2 == 1: recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = False else: recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = True if last_a: recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)]) else: recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)]) return recomb_out def recomb_b_first(mat_anc, fun_cut_spots, pat_anc=None): mat_a = mat_anc[:,0].tolist() mat_b = mat_anc[:,1].tolist() if len(cut_spots) == 0: recomb_out = mat_b return recomb_out recomb_out = [] for i in range(0, len(fun_cut_spots)): if i == 0: recomb_out = mat_b[0:fun_cut_spots[0]] if i % 2 == 1: recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = True else: recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]]) last_a = False if last_a: recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)]) else: recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)]) return recomb_out def find_start(g): start_out = 2 for g in range(2, g + 1): if g % 2 == 1: #odd start_out = start_out*2 else: #even start_out = start_out*2 - 1 return start_out # Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc. gen = 5 # Shouldn't pick less than 14 for genes_num if λ = 1.7. If the random Poisson variable ends up larger than # genes_num, it will simply recombine in fewer places than the random variable designated (w/o error, surprisingly). # In a test of 500m trials, the largest random Poisson value I got (w/ avg. = 1.7) was 13 genes_num = 14 genes_list = list(range(0, genes_num)) spots = list(range(1, genes_num)) # λ rand_fish = 1.7 trials = 500000 # Mat. is the same only shifted back one mat_start_num = find_start(gen-1) pat_start_num = find_start(gen) # Need to name output variables in order to later add percentages for each trial run # Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0: m2_0_in_common_w_p, m2_0_in_common_w_m, p1_0_in_common_w_p, m2_0_in_common_w_p, p1_0_in_common_w_m, m2_0_in_common_w_m = ({} for i in range(6)) for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means pat. m2_0_in_common_w_p[str(num) + '_' + str(gen)] = [] else: # even means mat. m2_0_in_common_w_m[str(num) + '_' + str(gen)] = [] # Ones who could be anc. of both m2_0 and p1_0 for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means pat. p1_0_in_common_w_p[str(num) + '_' + str(gen)] = [] m2_0_in_common_w_p[str(num) + '_' + str(gen)] = [] else: # even means mat. p1_0_in_common_w_m[str(num) + '_' + str(gen)] = [] m2_0_in_common_w_m[str(num) + '_' + str(gen)] = [] p, m = {}, {} # Begin the trials 'for loop' (the crux of the model): for i_trials in range(0, trials): # Make the set of anc. at gen. steps back for num in range(1, 2**gen + 1): if num % 2 == 1: # Male ancestor p[(num, gen)] = ['p' + str(num) + '_' + str(i) for i in genes_list] else: # Female ancestor. Make two columns, signifying homologues homol_a = ['m' + str(num) + '_a' + str(i) for i in genes_list] homol_b = ['m' + str(num) + '_b' + str(i) for i in genes_list] m[(num, gen)] = np.column_stack((homol_a, homol_b)) random.shuffle(spots) # Populate the rest of the tree except sibs.: for g in range(gen - 1, -1, -1): for num in range(1, 2**g + 1): num_of_recombs = np.random.poisson(rand_fish) cut_spots = sorted(spots[:num_of_recombs]) rand = random.random() # Used for deciding which homologue goes first in alternating during recombination if num % 2 == 1: # Male ancestor if rand < 0.5: p[(num, g)] = recomb_a_first(m[(num*2, g+1)], cut_spots) else: p[(num, g)] = recomb_b_first(m[(num*2, g+1)], cut_spots) else: # Female ancestor if rand < 0.5: homol_a = recomb_a_first(m[(num*2, g+1)], cut_spots) else: homol_a = recomb_b_first(m[(num*2, g+1)], cut_spots) homol_b = p[(num*2-1, g+1)] m[(num, g)] = np.column_stack((homol_a, homol_b)) # Since there won't be a 2 to make m2_0, it's hard-coded as a sib. to p1_0: random.shuffle(spots) num_of_recombs = np.random.poisson(rand_fish) cut_spots = sorted(spots[:num_of_recombs]) if random.random() < 0.5: homol_a = recomb_a_first(m[(2, 1)], cut_spots) else: homol_a = recomb_b_first(m[(2, 1)], cut_spots) homol_b = p[(1, 1)] #s1 = np.column_stack((homol_a, homol_b)) # Checking 'in common' doesn't work if it's an array. m[(2, 0)] = homol_a + homol_b # Ok to concat. homologues because m2_0 has no descendants # Calculate percent in common with ancestors. When the final descendant is female, the # percentage is that of just one homologue. To find the percentage of her total X-DNA, # genes_num would have to be multiplied by 2, but I believe that would be misleading. # W/o multiplying by 2, it's comparable to amount of DNA rather than just percentage, # and allows comparison to male descendants' DNA amount. # Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0: for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means pat. m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num)) else: # even means mat. m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num)) # Ones who could be anc. of both m2_0 and p1_0: for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means pat. p1_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'p' + str(num) + '*')) / genes_num)) m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num)) else: # even means mat. p1_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'm' + str(num) + '*')) / genes_num)) m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num)) # 'For loop' for trials ends here import statistics # combos() and kRecur() were modified from here: # https://stackoverflow.com/questions/7074051/what-is-the-best-way-to-generate-all-possible-three-letter-strings # which was modified by alphahmed from a solution by Abhinav Ramana on GeeksforGeeks.org. def combos(letters, k): out = [] l = len(letters) kRecur(letters, "", l, k, out) return(out) def kRecur(letters, prfx, l, k, out): if k == 0: out = out.append(prfx) else: for i in range(l): newPrfx = prfx + letters[i] kRecur(letters, newPrfx, l, k-1, out) if len(out) == 2**k: return out order_labels = sorted(combos(['m','p'], gen), reverse = True) confid = 90 lo = (100 - confid) / 2 hi = 100 - lo # Print ("pv" stands for print variable) for num in range(mat_start_num, pat_start_num): if num % 2 == 1: # odd means ancestor is pat. pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') else: # even means ancestor is mat. pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') for num in range(pat_start_num, 2**gen + 1): if num % 2 == 1: # odd means ancestor is pat. pv = p1_0_in_common_w_p[str(num) + '_' + str(gen)] print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') else: # even means ancestor is mat. pv = p1_0_in_common_w_m[str(num) + '_' + str(gen)] print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%') pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)] print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,', 'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',', 'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,', pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA you have when combined with various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.
Recent Comments