List Info

Thread: pmap.c hacking...




pmap.c hacking...
country flaguser name
Canada
2007-03-28 21:47:38
Hi

In an effort to reduce the amount of time spent in
"system" when 
doing a pkgsrc build, I've been mucking around with pmap.c
on amd64.  
One of the changes I've made is to have one page of 
pmap_tlb_shootdown_job's per CPU.  This eliminates the
global variable 
used to hold the "free pool" of such jobs, and
also eliminates all 
the locking contention associated with that free pool.  From
the 
testing I've done, performance for building a set of 166
packages 
('time make -k -j 8') goes from this (baseline):
 
 18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io
3371pf+57w

to this (when just doubling the number of shootdown job
slots available 
per CPU):

 18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io
3374pf+0w

to this (double the number of job slots, and eliminate the
global 
"free queue"):

 17941.4u 20939.2s 2:43:56.35 395.2% 0+0k 6048+2639046io
6331pf+0w

with my most recent tweaks to pmap.c (included here).

Comments are more than welcome (yes, the diff would need to
be 
cleaned up before it could be checked in  )  A
review by someone
more pmap/amd64-savvy would certainly be appreciated.   I suspect

similar changes could be made to the i386 pmap.c. (I just
havn't made 
time to make the changes there too and do testing)

Later...

Greg Oster



  
Re: pmap.c hacking...
country flaguser name
United States
2007-03-28 22:56:29
On Mar 28, 2007, at 7:47 PM, Greg Oster wrote:

> In an effort to reduce the amount of time spent in
"system" when
> doing a pkgsrc build, I've been mucking around with
pmap.c on amd64.
> One of the changes I've made is to have one page of
> pmap_tlb_shootdown_job's per CPU.  This eliminates the
global variable
> used to hold the "free pool" of such jobs,
and also eliminates all
> the locking contention associated with that free pool. 
From the
> testing I've done, performance for building a set of
166 packages
> ('time make -k -j 8') goes from this (baseline):

...

Cool.

This reminds me -- I really ought to change pool_cache to
support per- 
CPU caches.  For another day...

Anyway, I think this is probably OK for now, but we should
certainly  
revisit TLB shootdown in general.


>
>  18925.7u 25439.2s 3:05:03.98 399.5% 0+0k 229+2870194io
3371pf+57w
>
> to this (when just doubling the number of shootdown job
slots  
> available
> per CPU):
>
>  18776.5u 23900.5s 2:56:29.96 402.9% 0+0k 292+2864111io
3374pf+0w
>
> to this (double the number of job slots, and eliminate
the global
> "free queue"):
>
>  17941.4u 20939.2s 2:43:56.35 395.2% 0+0k
6048+2639046io 6331pf+0w
>
> with my most recent tweaks to pmap.c (included here).
>
> Comments are more than welcome (yes, the diff would
need to be
> cleaned up before it could be checked in  )  A
review by someone
> more pmap/amd64-savvy would certainly be appreciated.
  I
suspect
> similar changes could be made to the i386 pmap.c. (I
just havn't made
> time to make the changes there too and do testing)
>
> Later...
>
> Greg Oster
>
>
> --- pmap.c.orig	2007-03-27 09:17:49.000000000 -0600
> +++ pmap.c	2007-03-28 15:49:22.000000000 -0600
>  -359,19 +359,24  struct
pmap_tlb_shootdown_q {
>  	__cpu_simple_lock_t pq_slock;	/* spin lock on queue
*/
>  	int pq_flushg;		/* pending flush global */
>  	int pq_flushu;		/* pending flush user */
> +	struct pmap_tlb_shootdown_job *pj_free;
> +	union pmap_tlb_shootdown_job_al *pj_page;
>  } pmap_tlb_shootdown_q[X86_MAXPROCS];
>
> +#if 0
>  #define	PMAP_TLB_MAXJOBS	16
> +#endif
> +#define	PMAP_TLB_MAXJOBS	32
>
>  void	pmap_tlb_shootdown_q_drain __P((struct
pmap_tlb_shootdown_q *));
>  struct pmap_tlb_shootdown_job
*pmap_tlb_shootdown_job_get
>  	    __P((struct pmap_tlb_shootdown_q *));
>  void	pmap_tlb_shootdown_job_put __P((struct
pmap_tlb_shootdown_q *,
>  	    struct pmap_tlb_shootdown_job *));
> -
> +#if 0
>  __cpu_simple_lock_t pmap_tlb_shootdown_job_lock;
>  union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
> -
> +#endif
>  /*
>   * global data structures
>   */
>  -1089,7 +1094,9  pmap_bootstrap(kva_start)
>  	 * Initialize the TLB shootdown queues.
>  	 */
>
> +#if 0
> 
	__cpu_simple_lock_init(&pmap_tlb_shootdown_job_lock);
> +#endif
>
>  	for (i = 0; i < X86_MAXPROCS; i++) {
>  		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
>  -1148,6 +1155,9 
pmap_prealloc_lowmem_ptps(void)
>  void
>  pmap_init()
>  {
> +	struct pmap_tlb_shootdown_q *pq;
> +	struct cpu_info *ci;
> +	CPU_INFO_ITERATOR cii;
>  	int npages, lcv, i;
>  	vaddr_t addr;
>  	vsize_t s;
>  -1212,17 +1222,20  pmap_init()
>
>  	pv_nfpvents = 0;
>
> -	pj_page = (void *)uvm_km_alloc(kernel_map, PAGE_SIZE,
0,  
> UVM_KMF_WIRED);
> -	if (pj_page == NULL)
> -		panic("pmap_init: pj_page");
> -
> -	for (i = 0;
> -	     i < (PAGE_SIZE / sizeof (union
pmap_tlb_shootdown_job_al) - 1);
> -	     i++)
> -		pj_page[i].pja_job.pj_nextfree = &pj_page[i +
1].pja_job;
> -	pj_page[i].pja_job.pj_nextfree = NULL;
> -	pj_free = &pj_page[0];
> +	for (CPU_INFO_FOREACH(cii, ci)) {
> +		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
> +		pq->pj_page = (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE,
> +						   0, UVM_KMF_WIRED);
> +		if (pq->pj_page == NULL)
> +			panic("pmap_init: pj_page");
>
> +		for (i = 0;
> +		     i < (PAGE_SIZE / sizeof (union
pmap_tlb_shootdown_job_al) -  
> 1);
> +		     i++)
> +			pq->pj_page[i].pja_job.pj_nextfree =
&pq->pj_page[i + 1].pja_job;
> +		pq->pj_page[i].pja_job.pj_nextfree = NULL;
> +		pq->pj_free = (struct pmap_tlb_shootdown_job
*)&(pq->pj_page[0]);
> +	}
>  	/*
>  	 * done: pmap module is up (and ready for business)
>  	 */
>  -3660,7 +3673,6 
pmap_do_tlb_shootdown(struct cpu_info *s
>  			if ((!pq->pq_flushu &&
pmap_is_curpmap(pj->pj_pmap)) ||
>  			    (pj->pj_pte & pmap_pg_g))
>  				pmap_update_pg(pj->pj_va);
> -
>  			pmap_tlb_shootdown_job_put(pq, pj);
>  		}
>
>  -3717,22 +3729,12 
pmap_tlb_shootdown_job_get(pq)
>  	if (pq->pq_count >= PMAP_TLB_MAXJOBS)
>  		return (NULL);
>
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -	if (pj_free == NULL) {
> -#ifdef MULTIPROCESSOR
>
-		__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> +	if (pq->pj_free == NULL) {
>  		return NULL;
> +	} else {
> +		pj = pq->pj_free;
> +		pq->pj_free = pj->pj_nextfree;
>  	}
> -	pj = &pj_free->pja_job;
> -	pj_free =
> -	    (union pmap_tlb_shootdown_job_al
*)pj_free->pja_job.pj_nextfree;
> -#ifdef MULTIPROCESSOR
>
-	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
>  	pq->pq_count++;
>  	return (pj);
>  }
>  -3754,14 +3756,7 
pmap_tlb_shootdown_job_put(pq, pj)
>  	if (pq->pq_count == 0)
>  		panic("pmap_tlb_shootdown_job_put: queue length
inconsistency");
>  #endif
> -#ifdef MULTIPROCESSOR
> -	__cpu_simple_lock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -	pj->pj_nextfree = &pj_free->pja_job;
> -	pj_free = (union pmap_tlb_shootdown_job_al *)pj;
> -#ifdef MULTIPROCESSOR
>
-	__cpu_simple_unlock(&pmap_tlb_shootdown_job_lock);
> -#endif
> -
> +	pj->pj_nextfree = pq->pj_free;
> +	pq->pj_free = pj;
>  	pq->pq_count--;
>  }

-- thorpej


[1-2]

about | contact  Other archives ( Real Estate discussion Medical topics )