From 9bb7bc942d3da606f184ac6a4dfc7e4d470c831b Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 30 May 2005 15:35:26 -0700 Subject: [PATCH 01/11] [NETFILTER]: Fix deadlock with ip_queue and tcp local input path. When we have ip_queue being used from LOCAL_IN, then we end up with a situation where the verdicts coming back from userspace traverse the TCP input path from syscall context. While this seems to work most of the time, there's an ugly deadlock: syscall context is interrupted by the timer interrupt. When the timer interrupt leaves, the timer softirq get's scheduled and calls tcp_delack_timer() and alike. They themselves do bh_lock_sock(sk), which is already held from somewhere else -> boom. I've now tested the suggested solution by Patrick McHardy and Herbert Xu to simply use local_bh_{en,dis}able(). Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e5746b674413..eda1fba431a4 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -3,6 +3,7 @@ * communicating with userspace via netlink. * * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +18,7 @@ * 2005-01-10: Added /proc counter for dropped packets; fixed so * packets aren't delivered to user space if they're going * to be dropped. + * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) * */ #include @@ -71,7 +73,15 @@ static DECLARE_MUTEX(ipqnl_sem); static void ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) { + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like ipq_issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + kfree(entry); } From 208d89843b7b03978d8e748b8b991c1be81c4f43 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 30 May 2005 15:50:15 -0700 Subject: [PATCH 02/11] [IPV4]: Fix BUG() in 2.6.x, udp_poll(), fragments + CONFIG_HIGHMEM Steven Hand wrote: > > Reconstructed forward trace: > > net/ipv4/udp.c:1334 spin_lock_irq() > net/ipv4/udp.c:1336 udp_checksum_complete() > net/core/skbuff.c:1069 skb_shinfo(skb)->nr_frags > 1 > net/core/skbuff.c:1086 kunmap_skb_frag() > net/core/skbuff.h:1087 local_bh_enable() > kernel/softirq.c:0140 WARN_ON(irqs_disabled()); The receive queue lock is never taken in IRQs (and should never be) so we can simply substitute bh for irq. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a6952e3fee9..7c24e64b443f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) unsigned long amount; amount = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* @@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) */ amount = skb->len - sizeof(struct udphdr); } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } @@ -848,12 +848,12 @@ static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Clear queue. */ if (flags&MSG_PEEK) { int clear = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); if (skb == skb_peek(&sk->sk_receive_queue)) { __skb_unlink(skb, &sk->sk_receive_queue); clear = 1; } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); if (clear) kfree_skb(skb); } @@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; - spin_lock_irq(&rcvq->lock); + spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL) { if (udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UDP_MIB_INERRORS); @@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) break; } } - spin_unlock_irq(&rcvq->lock); + spin_unlock_bh(&rcvq->lock); /* nothing to see, move along */ if (skb == NULL) From 0451eb074eef30240c6c06dacf2911bee26831e1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:15:58 -0700 Subject: [PATCH 03/11] [PKT_SCHED]: Fix dsmark to count ignored indices while walking Unused indices which are ignored while walking must still be counted to avoid dumping the same index twice. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 8a3db9d95bab..acbe9d2b3e15 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -163,14 +163,15 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { if (p->mask[i] == 0xff && !p->value[i]) - continue; + goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i+1, walker) < 0) { walker->stop = 1; break; } } - walker->count++; +ignore: + walker->count++; } } From 486b53e59ca8cd07d91ad88375c1c884b15cc9bd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:16:52 -0700 Subject: [PATCH 04/11] [PKT_SCHED]: make dsmark try using pfifo instead of noop while grafting Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index acbe9d2b3e15..a8c948f62978 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,8 +73,13 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg, DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, old); - if (!new) - new = &noop_qdisc; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + sch_tree_lock(sch); *old = xchg(&p->q,new); if (*old) From 08e9cd1fc559c00bc05df3fc551efe3b87c57ee3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:17:28 -0700 Subject: [PATCH 05/11] [PKT_SCHED]: Disable dsmark debugging messages by default Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index a8c948f62978..d8bd2a569c7c 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -18,7 +18,7 @@ #include -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) #else #define DPRINTK(format,args...) From 7c963ad1d113790a8c723a178988b675868f3abe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 May 2005 16:57:59 -0700 Subject: [PATCH 06/11] [SPARC64]: Fix streaming buffer flushing on PCI and SBUS. Firstly, if the direction is TODEVICE, then dirty data in the streaming cache is impossible so we can elide the flush-flag synchronization in that case. Next, the context allocator is broken. It is highly likely that contexts get used multiple times for different dma mappings, which confuses the strbuf flushing code and makes it run inefficiently. Signed-off-by: David S. Miller --- arch/sparc64/kernel/pci_iommu.c | 90 +++++++++++++++++++++++++------- arch/sparc64/kernel/pci_psycho.c | 2 +- arch/sparc64/kernel/pci_sabre.c | 2 +- arch/sparc64/kernel/pci_schizo.c | 2 +- arch/sparc64/kernel/sbus.c | 20 ++++--- include/asm-sparc64/iommu.h | 2 + include/asm-sparc64/pbm.h | 8 +-- 7 files changed, 94 insertions(+), 32 deletions(-) diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c index 33ca56c90da2..1807876f8c36 100644 --- a/arch/sparc64/kernel/pci_iommu.c +++ b/arch/sparc64/kernel/pci_iommu.c @@ -196,6 +196,34 @@ static iopte_t *alloc_consistent_cluster(struct pci_iommu *iommu, unsigned long return NULL; } +static int iommu_alloc_ctx(struct pci_iommu *iommu) +{ + int lowest = iommu->ctx_lowest_free; + int sz = IOMMU_NUM_CTXS - lowest; + int n = find_next_zero_bit(iommu->ctx_bitmap, sz, lowest); + + if (unlikely(n == sz)) { + n = find_next_zero_bit(iommu->ctx_bitmap, lowest, 1); + if (unlikely(n == lowest)) { + printk(KERN_WARNING "IOMMU: Ran out of contexts.\n"); + n = 0; + } + } + if (n) + __set_bit(n, iommu->ctx_bitmap); + + return n; +} + +static inline void iommu_free_ctx(struct pci_iommu *iommu, int ctx) +{ + if (likely(ctx)) { + __clear_bit(ctx, iommu->ctx_bitmap); + if (ctx < iommu->ctx_lowest_free) + iommu->ctx_lowest_free = ctx; + } +} + /* Allocate and map kernel buffer of size SIZE using consistent mode * DMA for PCI device PDEV. Return non-NULL cpu-side address if * successful and set *DMA_ADDRP to the PCI side dma address. @@ -236,7 +264,7 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_ad npages = size >> IO_PAGE_SHIFT; ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); first_page = __pa(first_page); while (npages--) { iopte_val(*iopte) = (IOPTE_CONSISTENT(ctx) | @@ -317,6 +345,8 @@ void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_ } } + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); order = get_order(size); @@ -360,7 +390,7 @@ dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direct base_paddr = __pa(oaddr & IO_PAGE_MASK); ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); if (strbuf->strbuf_enabled) iopte_protection = IOPTE_STREAMING(ctx); else @@ -380,39 +410,55 @@ dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direct return PCI_DMA_ERROR_CODE; } -static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages) +static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages, int direction) { int limit; - PCI_STC_FLUSHFLAG_INIT(strbuf); if (strbuf->strbuf_ctxflush && iommu->iommu_ctxflush) { unsigned long matchreg, flushreg; + u64 val; flushreg = strbuf->strbuf_ctxflush; matchreg = PCI_STC_CTXMATCH_ADDR(strbuf, ctx); - limit = 100000; + if (pci_iommu_read(matchreg) == 0) + goto do_flush_sync; + pci_iommu_write(flushreg, ctx); - for(;;) { - if (((long)pci_iommu_read(matchreg)) >= 0L) - break; - limit--; - if (!limit) - break; - udelay(1); + if ((val = pci_iommu_read(matchreg)) == 0) + goto do_flush_sync; + + val &= 0xffff; + while (val) { + if (val & 0x1) + pci_iommu_write(flushreg, ctx); + val >>= 1; } - if (!limit) + val = pci_iommu_read(matchreg); + if (unlikely(val)) { printk(KERN_WARNING "pci_strbuf_flush: ctx flush " - "timeout vaddr[%08x] ctx[%lx]\n", - vaddr, ctx); + "timeout matchreg[%lx] ctx[%lx]\n", + val, ctx); + goto do_page_flush; + } } else { unsigned long i; + do_page_flush: for (i = 0; i < npages; i++, vaddr += IO_PAGE_SIZE) pci_iommu_write(strbuf->strbuf_pflush, vaddr); } +do_flush_sync: + /* If the device could not have possibly put dirty data into + * the streaming cache, no flush-flag synchronization needs + * to be performed. + */ + if (direction == PCI_DMA_TODEVICE) + return; + + PCI_STC_FLUSHFLAG_INIT(strbuf); pci_iommu_write(strbuf->strbuf_fsync, strbuf->strbuf_flushflag_pa); (void) pci_iommu_read(iommu->write_complete_reg); @@ -466,7 +512,7 @@ void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int /* Step 1: Kick data out of streaming buffers if necessary. */ if (strbuf->strbuf_enabled) - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); /* Step 2: Clear out first TSB entry. */ iopte_make_dummy(iommu, base); @@ -474,6 +520,8 @@ void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base, npages, ctx); + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); } @@ -613,7 +661,7 @@ int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int /* Step 4: Choose a context if necessary. */ ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); /* Step 5: Create the mappings. */ if (strbuf->strbuf_enabled) @@ -678,7 +726,7 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, /* Step 1: Kick data out of streaming buffers if necessary. */ if (strbuf->strbuf_enabled) - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); /* Step 2: Clear out first TSB entry. */ iopte_make_dummy(iommu, base); @@ -686,6 +734,8 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base, npages, ctx); + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); } @@ -724,7 +774,7 @@ void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size } /* Step 2: Kick data out of streaming buffers. */ - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -768,7 +818,7 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, i i--; npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) - bus_addr) >> IO_PAGE_SHIFT; - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); spin_unlock_irqrestore(&iommu->lock, flags); } diff --git a/arch/sparc64/kernel/pci_psycho.c b/arch/sparc64/kernel/pci_psycho.c index 3567fa879e1f..534320ef0db2 100644 --- a/arch/sparc64/kernel/pci_psycho.c +++ b/arch/sparc64/kernel/pci_psycho.c @@ -1212,7 +1212,7 @@ static void __init psycho_iommu_init(struct pci_controller_info *p) /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses. */ iommu->iommu_control = p->pbm_A.controller_regs + PSYCHO_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/pci_sabre.c b/arch/sparc64/kernel/pci_sabre.c index 5525d1ec4af8..53d333b4a4e8 100644 --- a/arch/sparc64/kernel/pci_sabre.c +++ b/arch/sparc64/kernel/pci_sabre.c @@ -1265,7 +1265,7 @@ static void __init sabre_iommu_init(struct pci_controller_info *p, /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses. */ iommu->iommu_control = p->pbm_A.controller_regs + SABRE_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/pci_schizo.c b/arch/sparc64/kernel/pci_schizo.c index e93fcadc3722..5753175b94e6 100644 --- a/arch/sparc64/kernel/pci_schizo.c +++ b/arch/sparc64/kernel/pci_schizo.c @@ -1753,7 +1753,7 @@ static void schizo_pbm_iommu_init(struct pci_pbm_info *pbm) /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses, SCHIZO has iommu ctx flushing. */ iommu->iommu_control = pbm->pbm_regs + SCHIZO_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c index 76ea6455433f..89f5e019f24c 100644 --- a/arch/sparc64/kernel/sbus.c +++ b/arch/sparc64/kernel/sbus.c @@ -117,17 +117,25 @@ static void iommu_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages #define STRBUF_TAG_VALID 0x02UL -static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages) +static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages, int direction) { unsigned long n; int limit; - iommu->strbuf_flushflag = 0UL; n = npages; while (n--) upa_writeq(base + (n << IO_PAGE_SHIFT), iommu->strbuf_regs + STRBUF_PFLUSH); + /* If the device could not have possibly put dirty data into + * the streaming cache, no flush-flag synchronization needs + * to be performed. + */ + if (direction == SBUS_DMA_TODEVICE) + return; + + iommu->strbuf_flushflag = 0UL; + /* Whoopee cushion! */ upa_writeq(__pa(&iommu->strbuf_flushflag), iommu->strbuf_regs + STRBUF_FSYNC); @@ -421,7 +429,7 @@ void sbus_unmap_single(struct sbus_dev *sdev, dma_addr_t dma_addr, size_t size, spin_lock_irqsave(&iommu->lock, flags); free_streaming_cluster(iommu, dma_base, size >> IO_PAGE_SHIFT); - sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -584,7 +592,7 @@ void sbus_unmap_sg(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int iommu = sdev->bus->iommu; spin_lock_irqsave(&iommu->lock, flags); free_streaming_cluster(iommu, dvma_base, size >> IO_PAGE_SHIFT); - sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -596,7 +604,7 @@ void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t base, size_t size = (IO_PAGE_ALIGN(base + size) - (base & IO_PAGE_MASK)); spin_lock_irqsave(&iommu->lock, flags); - sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -620,7 +628,7 @@ void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sg, int size = IO_PAGE_ALIGN(sg[i].dma_address + sg[i].dma_length) - base; spin_lock_irqsave(&iommu->lock, flags); - sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } diff --git a/include/asm-sparc64/iommu.h b/include/asm-sparc64/iommu.h index 5fd16e42a045..0de7a3da79cd 100644 --- a/include/asm-sparc64/iommu.h +++ b/include/asm-sparc64/iommu.h @@ -16,4 +16,6 @@ #define IOPTE_CACHE 0x0000000000000010UL /* Cached (in UPA E-cache) */ #define IOPTE_WRITE 0x0000000000000002UL /* Writeable */ +#define IOMMU_NUM_CTXS 4096 + #endif /* !(_SPARC_IOMMU_H) */ diff --git a/include/asm-sparc64/pbm.h b/include/asm-sparc64/pbm.h index 92999631c819..4c15610a2bac 100644 --- a/include/asm-sparc64/pbm.h +++ b/include/asm-sparc64/pbm.h @@ -15,6 +15,7 @@ #include #include #include +#include /* The abstraction used here is that there are PCI controllers, * each with one (Sabre) or two (PSYCHO/SCHIZO) PCI bus modules @@ -40,9 +41,6 @@ struct pci_iommu { */ spinlock_t lock; - /* Context allocator. */ - unsigned int iommu_cur_ctx; - /* IOMMU page table, a linear array of ioptes. */ iopte_t *page_table; /* The page table itself. */ int page_table_sz_bits; /* log2 of ow many pages does it map? */ @@ -87,6 +85,10 @@ struct pci_iommu { u16 flush; } alloc_info[PBM_NCLUSTERS]; + /* CTX allocation. */ + unsigned long ctx_lowest_free; + unsigned long ctx_bitmap[IOMMU_NUM_CTXS / (sizeof(unsigned long) * 8)]; + /* Here a PCI controller driver describes the areas of * PCI memory space where DMA to/from physical memory * are addressed. Drivers interrogate the PCI layer From 36839836e8132731e0cadddce452423036a1d5b3 Mon Sep 17 00:00:00 2001 From: Edgar E Iglesias Date: Tue, 31 May 2005 17:08:05 -0700 Subject: [PATCH 07/11] [IPSEC]: Fix esp_decap_data size verification in esp4. Signed-off-by: Edgar E Iglesias Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 053a883247ba..eae84cc39d3f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -478,7 +478,7 @@ static int __init esp4_init(void) { struct xfrm_decap_state decap; - if (sizeof(struct esp_decap_data) < + if (sizeof(struct esp_decap_data) > sizeof(decap.decap_data)) { extern void decap_data_too_small(void); From 88314ee73fd75eb32abdcb3119cd303c116d4500 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 May 2005 19:13:52 -0700 Subject: [PATCH 08/11] [SPARC64]: Refine PCI strbuf ctx-based flush. The initial peek read PIO of the match register is just a waste. Just do the flush writes first, as that is more efficient. Signed-off-by: David S. Miller --- arch/sparc64/kernel/pci_iommu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c index 1807876f8c36..2803bc7c2c79 100644 --- a/arch/sparc64/kernel/pci_iommu.c +++ b/arch/sparc64/kernel/pci_iommu.c @@ -422,14 +422,12 @@ static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, flushreg = strbuf->strbuf_ctxflush; matchreg = PCI_STC_CTXMATCH_ADDR(strbuf, ctx); - if (pci_iommu_read(matchreg) == 0) - goto do_flush_sync; - pci_iommu_write(flushreg, ctx); - if ((val = pci_iommu_read(matchreg)) == 0) + val = pci_iommu_read(matchreg); + val &= 0xffff; + if (!val) goto do_flush_sync; - val &= 0xffff; while (val) { if (val & 0x1) pci_iommu_write(flushreg, ctx); From b655913bf364603d17ad770dc4fb80e60555a255 Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Tue, 31 May 2005 22:34:00 -0700 Subject: [PATCH 09/11] [IA64] Cleanup compile warnings for ski config The attached patch cleans up a compilation warning when ACPI is turned off (i.e., when compiling for the Ski simulator). Signed-off-by: Tony Luck --- arch/ia64/kernel/mca.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 2c75741dcc66..736e328b5e61 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1103,8 +1103,6 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs) return IRQ_HANDLED; } -#endif /* CONFIG_ACPI */ - /* * ia64_mca_cpe_poll * @@ -1122,6 +1120,8 @@ ia64_mca_cpe_poll (unsigned long dummy) platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); } +#endif /* CONFIG_ACPI */ + /* * C portion of the OS INIT handler * From d8caebd285a084ee1e4d484ce597865228614067 Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Tue, 31 May 2005 22:37:00 -0700 Subject: [PATCH 10/11] [IA64] fix compilation warning in sys32_epoll_wait() This gets rid of an unused variable `error' in sys_ia32.c:sys32_epoll_wait() Getting rid of this one makes parsing the output of the kernecomp autobuild easier --- searching for `Error' to find a problem kept hitting this one, even though it's only a warning. Signed-off-by: Tony Luck --- arch/ia64/ia32/sys_ia32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 247a21c64aea..c1e20d65dd6c 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -2427,7 +2427,7 @@ sys32_epoll_wait(int epfd, struct epoll_event32 __user * events, int maxevents, { struct epoll_event *events64 = NULL; mm_segment_t old_fs = get_fs(); - int error, numevents, size; + int numevents, size; int evt_idx; int do_free_pages = 0; From 1e86d1c648508fd50e6c9960576b87906a7906ad Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 2 Jun 2005 14:11:37 +1000 Subject: [PATCH 11/11] [PATCH] ppc64: Fix result code handling in prom_init prom_init(), the trampoline code that "talks" to Open Firmware during early boot, has various issues with managing OF result codes. Some of my recent fixups in fact made the problem worse on some platforms. This patch reworks it all. Tested on g5, Maple, POWER3 and POWER5. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/prom_init.c | 102 ++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 41 deletions(-) diff --git a/arch/ppc64/kernel/prom_init.c b/arch/ppc64/kernel/prom_init.c index 3de950de3671..1ac531ba7056 100644 --- a/arch/ppc64/kernel/prom_init.c +++ b/arch/ppc64/kernel/prom_init.c @@ -211,13 +211,23 @@ struct { */ #define ADDR(x) (u32) ((unsigned long)(x) - offset) +/* + * Error results ... some OF calls will return "-1" on error, some + * will return 0, some will return either. To simplify, here are + * macros to use with any ihandle or phandle return value to check if + * it is valid + */ + +#define PROM_ERROR (-1u) +#define PHANDLE_VALID(p) ((p) != 0 && (p) != PROM_ERROR) +#define IHANDLE_VALID(i) ((i) != 0 && (i) != PROM_ERROR) + + /* This is the one and *ONLY* place where we actually call open * firmware from, since we need to make sure we're running in 32b * mode when we do. We switch back to 64b mode upon return. */ -#define PROM_ERROR (-1) - static int __init call_prom(const char *service, int nargs, int nret, ...) { int i; @@ -587,14 +597,13 @@ static void __init prom_send_capabilities(void) { unsigned long offset = reloc_offset(); ihandle elfloader; - int ret; elfloader = call_prom("open", 1, 1, ADDR("/packages/elf-loader")); if (elfloader == 0) { prom_printf("couldn't open /packages/elf-loader\n"); return; } - ret = call_prom("call-method", 3, 1, ADDR("process-elf-header"), + call_prom("call-method", 3, 1, ADDR("process-elf-header"), elfloader, ADDR(&fake_elf)); call_prom("close", 1, 0, elfloader); } @@ -646,7 +655,7 @@ static unsigned long __init alloc_up(unsigned long size, unsigned long align) base = _ALIGN_UP(base + 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); - if ((int)addr != PROM_ERROR) + if (addr != PROM_ERROR) break; addr = 0; if (align == 0) @@ -708,7 +717,7 @@ static unsigned long __init alloc_down(unsigned long size, unsigned long align, for(; base > RELOC(alloc_bottom); base = _ALIGN_DOWN(base - 0x100000, align)) { prom_debug(" trying: 0x%x\n\r", base); addr = (unsigned long)prom_claim(base, size, 0); - if ((int)addr != PROM_ERROR) + if (addr != PROM_ERROR) break; addr = 0; } @@ -902,18 +911,19 @@ static void __init prom_instantiate_rtas(void) { unsigned long offset = reloc_offset(); struct prom_t *_prom = PTRRELOC(&prom); - phandle prom_rtas, rtas_node; + phandle rtas_node; + ihandle rtas_inst; u32 base, entry = 0; u32 size = 0; prom_debug("prom_instantiate_rtas: start...\n"); - prom_rtas = call_prom("finddevice", 1, 1, ADDR("/rtas")); - prom_debug("prom_rtas: %x\n", prom_rtas); - if (prom_rtas == (phandle) -1) + rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas")); + prom_debug("rtas_node: %x\n", rtas_node); + if (!PHANDLE_VALID(rtas_node)) return; - prom_getprop(prom_rtas, "rtas-size", &size, sizeof(size)); + prom_getprop(rtas_node, "rtas-size", &size, sizeof(size)); if (size == 0) return; @@ -922,14 +932,18 @@ static void __init prom_instantiate_rtas(void) prom_printf("RTAS allocation failed !\n"); return; } - prom_printf("instantiating rtas at 0x%x", base); - rtas_node = call_prom("open", 1, 1, ADDR("/rtas")); - prom_printf("..."); + rtas_inst = call_prom("open", 1, 1, ADDR("/rtas")); + if (!IHANDLE_VALID(rtas_inst)) { + prom_printf("opening rtas package failed"); + return; + } + + prom_printf("instantiating rtas at 0x%x ...", base); if (call_prom("call-method", 3, 2, ADDR("instantiate-rtas"), - rtas_node, base) != PROM_ERROR) { + rtas_inst, base) != PROM_ERROR) { entry = (long)_prom->args.rets[1]; } if (entry == 0) { @@ -940,8 +954,8 @@ static void __init prom_instantiate_rtas(void) reserve_mem(base, size); - prom_setprop(prom_rtas, "linux,rtas-base", &base, sizeof(base)); - prom_setprop(prom_rtas, "linux,rtas-entry", &entry, sizeof(entry)); + prom_setprop(rtas_node, "linux,rtas-base", &base, sizeof(base)); + prom_setprop(rtas_node, "linux,rtas-entry", &entry, sizeof(entry)); prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); @@ -1062,7 +1076,7 @@ static void __init prom_initialize_tce_table(void) prom_printf("opening PHB %s", path); phb_node = call_prom("open", 1, 1, path); - if ( (long)phb_node <= 0) + if (phb_node == 0) prom_printf("... failed\n"); else prom_printf("... done\n"); @@ -1279,12 +1293,12 @@ static void __init prom_init_client_services(unsigned long pp) /* get a handle for the stdout device */ _prom->chosen = call_prom("finddevice", 1, 1, ADDR("/chosen")); - if ((long)_prom->chosen <= 0) + if (!PHANDLE_VALID(_prom->chosen)) prom_panic("cannot find chosen"); /* msg won't be printed :( */ /* get device tree root */ _prom->root = call_prom("finddevice", 1, 1, ADDR("/")); - if ((long)_prom->root <= 0) + if (!PHANDLE_VALID(_prom->root)) prom_panic("cannot find device tree root"); /* msg won't be printed :( */ } @@ -1356,9 +1370,8 @@ static int __init prom_find_machine_type(void) } /* Default to pSeries. We need to know if we are running LPAR */ rtas = call_prom("finddevice", 1, 1, ADDR("/rtas")); - if (rtas != (phandle) -1) { - unsigned long x; - x = prom_getproplen(rtas, "ibm,hypertas-functions"); + if (!PHANDLE_VALID(rtas)) { + int x = prom_getproplen(rtas, "ibm,hypertas-functions"); if (x != PROM_ERROR) { prom_printf("Hypertas detected, assuming LPAR !\n"); return PLATFORM_PSERIES_LPAR; @@ -1426,12 +1439,13 @@ static void __init prom_check_displays(void) * leave some room at the end of the path for appending extra * arguments */ - if (call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-10) < 0) + if (call_prom("package-to-path", 3, 1, node, path, + PROM_SCRATCH_SIZE-10) == PROM_ERROR) continue; prom_printf("found display : %s, opening ... ", path); ih = call_prom("open", 1, 1, path); - if (ih == (ihandle)0 || ih == (ihandle)-1) { + if (ih == 0) { prom_printf("failed\n"); continue; } @@ -1514,6 +1528,12 @@ static unsigned long __init dt_find_string(char *str) return 0; } +/* + * The Open Firmware 1275 specification states properties must be 31 bytes or + * less, however not all firmwares obey this. Make it 64 bytes to be safe. + */ +#define MAX_PROPERTY_NAME 64 + static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, unsigned long *mem_end) { @@ -1527,10 +1547,12 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, /* get and store all property names */ prev_name = RELOC(""); for (;;) { - - /* 32 is max len of name including nul. */ - namep = make_room(mem_start, mem_end, 32, 1); - if (call_prom("nextprop", 3, 1, node, prev_name, namep) <= 0) { + int rc; + + /* 64 is max len of name including nul. */ + namep = make_room(mem_start, mem_end, MAX_PROPERTY_NAME, 1); + rc = call_prom("nextprop", 3, 1, node, prev_name, namep); + if (rc != 1) { /* No more nodes: unwind alloc */ *mem_start = (unsigned long)namep; break; @@ -1555,12 +1577,6 @@ static void __init scan_dt_build_strings(phandle node, unsigned long *mem_start, } } -/* - * The Open Firmware 1275 specification states properties must be 31 bytes or - * less, however not all firmwares obey this. Make it 64 bytes to be safe. - */ -#define MAX_PROPERTY_NAME 64 - static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, unsigned long *mem_end) { @@ -1607,7 +1623,10 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, prev_name = RELOC(""); sstart = (char *)RELOC(dt_string_start); for (;;) { - if (call_prom("nextprop", 3, 1, node, prev_name, pname) <= 0) + int rc; + + rc = call_prom("nextprop", 3, 1, node, prev_name, pname); + if (rc != 1) break; /* find string offset */ @@ -1623,7 +1642,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, l = call_prom("getproplen", 2, 1, node, pname); /* sanity checks */ - if (l < 0) + if (l == PROM_ERROR) continue; if (l > MAX_PROPERTY_LENGTH) { prom_printf("WARNING: ignoring large property "); @@ -1771,17 +1790,18 @@ static void __init fixup_device_tree(void) /* Some G5s have a missing interrupt definition, fix it up here */ u3 = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000")); - if ((long)u3 <= 0) + if (!PHANDLE_VALID(u3)) return; i2c = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000/i2c@f8001000")); - if ((long)i2c <= 0) + if (!PHANDLE_VALID(i2c)) return; mpic = call_prom("finddevice", 1, 1, ADDR("/u3@0,f8000000/mpic@f8040000")); - if ((long)mpic <= 0) + if (!PHANDLE_VALID(mpic)) return; /* check if proper rev of u3 */ - if (prom_getprop(u3, "device-rev", &u3_rev, sizeof(u3_rev)) <= 0) + if (prom_getprop(u3, "device-rev", &u3_rev, sizeof(u3_rev)) + == PROM_ERROR) return; if (u3_rev != 0x35) return;