Select Page
```# Copyright (c) 2020 Brit Nicholson.
# the terms of the Simple Non Code License (SNCL) v2.3.0,
# which is available at

# This is the code that was used to calculate expected percentages of X-DNA shared
# between a person and various ancestors. More information can be found here:
# https://dna-sci.com/2020/03/23/modeling-the-inheritance-of-x-chromosome-dna/

import numpy as np
import random
import fnmatch

def recomb_a_first(mat_anc, fun_cut_spots, pat_anc=None):

mat_a = mat_anc[:,0].tolist()
mat_b = mat_anc[:,1].tolist()

if len(cut_spots) == 0:
recomb_out = mat_a
return recomb_out

recomb_out = []
for i in range(0, len(fun_cut_spots)):
if i == 0:
recomb_out = mat_a[0:fun_cut_spots]
if i % 2 == 1:
recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = False
else:
recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = True
if last_a:
recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
else:
recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
return recomb_out

def recomb_b_first(mat_anc, fun_cut_spots, pat_anc=None):

mat_a = mat_anc[:,0].tolist()
mat_b = mat_anc[:,1].tolist()

if len(cut_spots) == 0:
recomb_out = mat_b
return recomb_out

recomb_out = []
for i in range(0, len(fun_cut_spots)):
if i == 0:
recomb_out = mat_b[0:fun_cut_spots]
if i % 2 == 1:
recomb_out.extend(mat_a[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = True
else:
recomb_out.extend(mat_b[fun_cut_spots[i-1]:fun_cut_spots[i]])
last_a = False
if last_a:
recomb_out.extend(mat_b[fun_cut_spots[i]:len(mat_b)])
else:
recomb_out.extend(mat_a[fun_cut_spots[i]:len(mat_a)])
return recomb_out

def find_start(g):
start_out = 2
for g in range(2, g + 1):
if g % 2 == 1: #odd
start_out = start_out*2
else: #even
start_out = start_out*2 - 1
return start_out

# Gen. = 2 simulates grandparents, 3 simulates great-grandparents, etc.
gen = 5

# Shouldn't pick less than 14 for genes_num if λ = 1.7. If the random Poisson variable ends up larger than
# genes_num, it will simply recombine in fewer places than the random variable designated (w/o error, surprisingly).
# In a test of 500m trials, the largest random Poisson value I got (w/ avg. = 1.7) was 13
genes_num = 14
genes_list = list(range(0, genes_num))
spots = list(range(1, genes_num))

# λ
rand_fish = 1.7

trials = 500000

# Mat. is the same only shifted back one
mat_start_num = find_start(gen-1)
pat_start_num = find_start(gen)

# Need to name output variables in order to later add percentages for each trial run
# Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
m2_0_in_common_w_p, m2_0_in_common_w_m, p1_0_in_common_w_p, m2_0_in_common_w_p,
p1_0_in_common_w_m, m2_0_in_common_w_m = ({} for i in range(6))
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means pat.
m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
else: # even means mat.
m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []
# Ones who could be anc. of both m2_0 and p1_0
for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means pat.
p1_0_in_common_w_p[str(num) + '_' + str(gen)] = []
m2_0_in_common_w_p[str(num) + '_' + str(gen)] = []
else: # even means mat.
p1_0_in_common_w_m[str(num) + '_' + str(gen)] = []
m2_0_in_common_w_m[str(num) + '_' + str(gen)] = []

p, m = {}, {}
# Begin the trials 'for loop' (the crux of the model):
for i_trials in range(0, trials):

# Make the set of anc. at gen. steps back
for num in range(1, 2**gen + 1):
if num % 2 == 1: # Male ancestor
p[(num, gen)] = ['p' + str(num) + '_' + str(i) for i in genes_list]
else: # Female ancestor. Make two columns, signifying homologues
homol_a = ['m' + str(num) + '_a' + str(i) for i in genes_list]
homol_b = ['m' + str(num) + '_b' + str(i) for i in genes_list]
m[(num, gen)] = np.column_stack((homol_a, homol_b))

random.shuffle(spots)

# Populate the rest of the tree except sibs.:
for g in range(gen - 1, -1, -1):
for num in range(1, 2**g + 1):

num_of_recombs = np.random.poisson(rand_fish)
cut_spots = sorted(spots[:num_of_recombs])
rand = random.random() # Used for deciding which homologue goes first in alternating during recombination
if num % 2 == 1: # Male ancestor
if rand < 0.5:
p[(num, g)] = recomb_a_first(m[(num*2, g+1)], cut_spots)
else:
p[(num, g)] = recomb_b_first(m[(num*2, g+1)], cut_spots)
else: # Female ancestor
if rand < 0.5:
homol_a = recomb_a_first(m[(num*2, g+1)], cut_spots)
else:
homol_a = recomb_b_first(m[(num*2, g+1)], cut_spots)
homol_b = p[(num*2-1, g+1)]
m[(num, g)] = np.column_stack((homol_a, homol_b))

# Since there won't be a 2 to make m2_0, it's hard-coded as a sib. to p1_0:
random.shuffle(spots)
num_of_recombs = np.random.poisson(rand_fish)
cut_spots = sorted(spots[:num_of_recombs])
if random.random() < 0.5:
homol_a = recomb_a_first(m[(2, 1)], cut_spots)
else:
homol_a = recomb_b_first(m[(2, 1)], cut_spots)
homol_b = p[(1, 1)]
#s1 = np.column_stack((homol_a, homol_b)) # Checking 'in common' doesn't work if it's an array.
m[(2, 0)] = homol_a + homol_b # Ok to concat. homologues because m2_0 has no descendants

# Calculate percent in common with ancestors. When the final descendant is female, the
# percentage is that of just one homologue. To find the percentage of her total X-DNA,
# genes_num would have to be multiplied by 2, but I believe that would be misleading.
# W/o multiplying by 2, it's comparable to amount of DNA rather than just percentage,
# and allows comparison to male descendants' DNA amount.
# Just the ones who could be anc. of m2_0, excluding those who could be anc. of p1_0:
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means pat.
m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
else: # even means mat.
m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
# Ones who could be anc. of both m2_0 and p1_0:
for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means pat.
p1_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'p' + str(num) + '*')) / genes_num))
m2_0_in_common_w_p[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'p' + str(num) + '*')) / genes_num))
else: # even means mat.
p1_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(p[(1, 0)], 'm' + str(num) + '*')) / genes_num))
m2_0_in_common_w_m[str(num) + '_' + str(gen)].append(round(100*len(fnmatch.filter(m[(2, 0)], 'm' + str(num) + '*')) / genes_num))
# 'For loop' for trials ends here

import statistics

# combos() and kRecur() were modified from here:
# https://stackoverflow.com/questions/7074051/what-is-the-best-way-to-generate-all-possible-three-letter-strings
# which was modified by alphahmed from a solution by Abhinav Ramana on GeeksforGeeks.org.
def combos(letters, k):
out = []
l = len(letters)
kRecur(letters, "", l, k, out)
return(out)

def kRecur(letters, prfx, l, k, out):
if k == 0:
out = out.append(prfx)
else:
for i in range(l):
newPrfx = prfx + letters[i]
kRecur(letters, newPrfx, l, k-1, out)
if len(out) == 2**k:
return out

order_labels = sorted(combos(['m','p'], gen), reverse = True)

confid = 90
lo = (100 - confid) / 2
hi = 100 - lo

# Print ("pv" stands for print variable)
for num in range(mat_start_num, pat_start_num):
if num % 2 == 1: # odd means ancestor is pat.
pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
else: # even means ancestor is mat.
pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

for num in range(pat_start_num, 2**gen + 1):
if num % 2 == 1: # odd means ancestor is pat.
pv = p1_0_in_common_w_p[str(num) + '_' + str(gen)]
print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')
pv = m2_0_in_common_w_p[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

else: # even means ancestor is mat.
pv = p1_0_in_common_w_m[str(num) + '_' + str(gen)]
print('p_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')

pv = m2_0_in_common_w_m[str(num) + '_' + str(gen)]
print('m_0 w/', order_labels[num-1] + ':', str(round(statistics.mean(pv),2)) + '%,',
'min:', str(round(min(pv),2)) + ',', '90%-Conf.:', str(np.percentile(pv, lo)) + '-' + str(np.percentile(pv, hi)) + ',',
'max:', str(round(max(pv),2)) + ',', round(statistics.stdev(pv),1), 'std,',
pv.count(0) / len(pv), '0%', pv.count(100) / len(pv), '100%')```

Feel free to ask me about modeling & simulation, genetic genealogy, or genealogical research. To see my articles on Medium, click here. And try out a nifty calculator that’s based on the first of my three genetic models. It lets you find the amount of an ancestor’s DNA you have when combined with various relatives. And most importantly, check out these ranges of shared DNA percentages or shared centiMorgans, which are the only published values that match known standard deviations.